CSV Data

Note

Previously we used the Fashion-MNIST dataset and the California housing dataset, both of which were ndarrays held in memory.
This section walks through an example of loading and preprocessing data in CSV format.

Downloading the Data

from tensorflow import keras

# URLs of the data files
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Download the CSV files
train_file_path = keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = keras.utils.get_file("eval.csv", TEST_DATA_URL)
# Inspect the data
!head {train_file_path}
survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n
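
keras.utils.get_file caches the download locally and returns the local path, so re-running the cell does not download the file again. A quick check, just as a sketch:

import os

# get_file returns the cached local path of the downloaded file
print(train_file_path)                   # the exact path depends on the environment
print(os.path.getsize(train_file_path))  # file size in bytes, to confirm the download succeeded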

Loading the Data

Read the CSV data from the files and create a dataset.

import tensorflow as tf

def get_dataset(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=12, # use a small batch size so the example is easier to display
        label_name='survived',
        na_value="?",
        num_epochs=1,
        ignore_errors=True)
    return dataset


# Shuffle the training data (the dataset is already batched, so this shuffles at the batch level)
train_data = get_dataset(train_file_path).shuffle(500)
test_data = get_dataset(test_file_path)
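
To sanity-check the pipeline, one batch can be printed directly. The helper below, show_batch, is an illustrative name introduced here (not part of the original code); it assumes the train_data created above.

# A minimal sketch: print the first batch of features and labels.
def show_batch(dataset):
    for features, labels in dataset.take(1):
        for key, value in features.items():
            print(f"{key:20s}: {value.numpy()}")
        print(f"{'label (survived)':20s}: {labels.numpy()}")

show_batch(train_data)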

Data Preprocessing

Categorical Data

Some of the columns in the CSV data are categorical, i.e. they can only take values from a limited set.

import pandas as pd

df = pd.read_csv(train_file_path)
# Get the vocabulary of each categorical column
for name in ['sex', 'class', 'deck', 'embark_town', 'alone']:
    print(name, df[name].unique())
sex ['male' 'female']
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']
# Vocabulary of each categorical column
CATEGORIES = {
    'sex': ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town' : ['Cherbourg', 'Southampton', 'Queenstown'],
    'alone' : ['y', 'n']
}
categorical_columns = []
for feature, vocab in CATEGORIES.items():
    # Create a categorical column from the column name and its vocab; the indicator column one-hot encodes it
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
    categorical_columns.append(tf.feature_column.indicator_column(cat_col))
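
As a quick check, a DenseFeatures layer built from a single indicator column shows how one batch is one-hot encoded. example_batch and sex_layer are illustrative names; the batch comes from the train_data created earlier.

# A minimal sketch: one-hot encode the 'sex' column of a single batch.
example_batch, example_labels = next(iter(train_data))                # dict of feature tensors, plus labels
sex_layer = tf.keras.layers.DenseFeatures([categorical_columns[0]])   # the 'sex' indicator column
print(sex_layer(example_batch).numpy())                               # shape (12, 2): one one-hot row per passenger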

Continuous Data

Continuous data needs to be rescaled; here each value is simply divided by the mean of its column.

def process_continuous(mean, data):
    # Rescale the continuous data by dividing by the column mean
    data = tf.cast(data, tf.float32) / mean
    return tf.reshape(data, [-1, 1])
# Get the mean of each continuous column
for name in ['age', 'n_siblings_spouses', 'parch', 'fare']:
    print(name, df[name].mean())
age 29.631307814992027
n_siblings_spouses 0.5454545454545454
parch 0.379585326953748
fare 34.385398564593245
# Mean of each continuous column
MEANS = {
    'age' : 29.631308,
    'n_siblings_spouses' : 0.545455,
    'parch' : 0.379585,
    'fare' : 34.385399
}
import functools

numerical_columns = []
for feature in MEANS.keys():
    # Use normalizer_fn for preprocessing.
    # functools.partial builds a callable from a function with some of its arguments fixed;
    # here it fixes the mean, so normalizer_fn only receives the data.
    num_col = tf.feature_column.numeric_column(
        feature, normalizer_fn=functools.partial(process_continuous, MEANS[feature]))
    numerical_columns.append(num_col)
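
The numeric columns can be spot-checked on a single batch in the same way; numeric_batch and numeric_layer are illustrative names.

# A minimal sketch: apply the scaled numeric columns to one batch.
numeric_batch, _ = next(iter(train_data))
numeric_layer = tf.keras.layers.DenseFeatures(numerical_columns)
print(numeric_layer(numeric_batch).numpy())  # shape (12, 4); each column has been divided by its mean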

Creating the Preprocessing Layer

Combine the categorical and continuous columns.

preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns + numerical_columns)
for X, y in test_data:
    # shape is (batch_size, input_dim) = (12, 24):
    # 20 one-hot categorical dimensions (2+3+10+3+2) plus 4 numeric dimensions
    print(preprocessing_layer(X).shape)
    break
(12, 24)

Using the Data

# Put the preprocessing layer first
model = tf.keras.Sequential([
  preprocessing_layer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])
# Compile
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])
# Ignore the WARNINGs; the official documentation produces them as well
model.fit(train_data, epochs=5)
Epoch 1/5
53/53 [==============================] - 0s 1ms/step - loss: 0.3957 - accuracy: 0.8293
Epoch 2/5
53/53 [==============================] - 0s 953us/step - loss: 0.3819 - accuracy: 0.8309
Epoch 3/5
53/53 [==============================] - 0s 1ms/step - loss: 0.3806 - accuracy: 0.8421
Epoch 4/5
53/53 [==============================] - 0s 957us/step - loss: 0.3809 - accuracy: 0.8389
Epoch 5/5
53/53 [==============================] - 0s 1ms/step - loss: 0.3684 - accuracy: 0.8517
<keras.callbacks.History at 0x7ff7ad702b80>
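
After training, the same test_data pipeline can be passed to evaluate and predict. The sketch below only assumes the model and test_data defined above.

# A minimal sketch: evaluate on the test set and inspect a few predicted survival probabilities.
test_loss, test_accuracy = model.evaluate(test_data)
print('Test loss: {:.4f}, test accuracy: {:.4f}'.format(test_loss, test_accuracy))

predictions = model.predict(test_data)
print(predictions[:5])  # sigmoid outputs in [0, 1]; values closer to 1 mean "predicted to survive"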