# Data Preprocessing

```{note}
To apply deep learning in the wild, we must extract messy data stored in arbitrary formats and preprocess it to suit our needs. Fortunately, the pandas library can do much of the heavy lifting.
```

## Reading the dataset

In [1]:
import os

# Create ../data if it does not exist yet, then write a tiny CSV file into it.
# The literal NA entries stand for missing values.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')
csv_text = '''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000'''
with open(data_file, 'w') as csv_file:
    csv_file.write(csv_text)

In [2]:
import pandas as pd

# Load the CSV into a DataFrame; pandas turns every entry whose value is NA
# into its special NaN (not a number) marker
data = pd.read_csv(filepath_or_buffer=data_file)
data

Unnamed: 0,NumRooms,RoofType,Price
0,,,127500
1,2.0,,106000
2,4.0,Slate,178100
3,,,140000


## Missing values

In [3]:
# integer-location based indexing: columns 0 and 1 become the inputs,
# column 2 becomes the targets
inputs = data.iloc[:, :2]
targets = data.iloc[:, 2]
inputs

Unnamed: 0,NumRooms,RoofType
0,,
1,2.0,
2,4.0,Slate
3,,


In [4]:
# for categorical input fields, NaN can be treated as a category of its own:
# dummy_na=True adds an indicator column for the missing entries
inputs = inputs.pipe(pd.get_dummies, dummy_na=True)
inputs

Unnamed: 0,NumRooms,RoofType_Slate,RoofType_nan
0,,False,True
1,2.0,False,True
2,4.0,True,False
3,,False,True


In [5]:
# for missing numerical values, compute each column's mean and use it to
# replace the NaN entries of that column
column_means = inputs.mean()
inputs = inputs.fillna(value=column_means)
inputs

Unnamed: 0,NumRooms,RoofType_Slate,RoofType_nan
0,3.0,False,True
1,2.0,False,True
2,4.0,True,False
3,3.0,False,True


## Conversion to the tensor format

In [6]:
import torch
# Convert inputs and targets to float NumPy arrays and wrap each one in a tensor
X, y = (
    torch.tensor(frame.to_numpy(dtype=float))
    for frame in (inputs, targets)
)
X, y

(tensor([[3., 0., 1.],
         [2., 0., 1.],
         [4., 1., 0.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))