-
Notifications
You must be signed in to change notification settings - Fork 0
/
transform_df.py
38 lines (27 loc) · 887 Bytes
/
transform_df.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# -*- coding: utf-8 -*-
"""transform_df.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1q-f8ZWASQuX2gzqkXC3VkweQbKEJYCi-
"""
from typing import Iterable, Optional

from pandas import DataFrame


def _remove_outliers(df: DataFrame, columns: Iterable, n_std: float) -> DataFrame:
    """Return ``df`` without rows further than ``n_std`` std devs from a column mean."""
    for col in columns:
        mean = df[col].mean()
        sd = df[col].std()
        deviation = (df[col] - mean).abs()
        # Negate "is an outlier" instead of testing "is an inlier": when sd is
        # NaN (e.g. a single-row frame) every comparison is False, so rows are
        # kept rather than the whole frame being emptied.
        df = df[~(deviation > n_std * sd)]
    return df


def transform(
    df: DataFrame,
    *,
    outlier_columns: Optional[Iterable] = None,
    n_std: float = 3.0,
) -> DataFrame:
    """Apply dataset-agnostic cleaning steps to ``df``.

    The steps run in this order:

    1. remove duplicate rows (the first occurrence is kept);
    2. drop rows that contain any null value;
    3. optionally remove outliers from ``outlier_columns``.

    Steps 1 and 2 modify ``df`` in place and the same object is returned.
    When outlier removal is requested the result is a new, filtered frame,
    so callers should always use the return value.

    Args:
        df: The frame to clean. It is mutated by steps 1 and 2.
        outlier_columns: Labels of numeric columns to filter for outliers.
            A single string is treated as one column label. ``None`` (the
            default) skips outlier removal entirely.
        n_std: Number of standard deviations from the column mean beyond
            which a row counts as an outlier. Columns are processed one
            after another, each using the statistics of the rows that are
            still present.

    Returns:
        The cleaned frame.

    Raises:
        KeyError: If a label in ``outlier_columns`` is not a column of ``df``.
    """
    # Both calls need inplace=True; without it pandas returns a new frame and
    # leaves ``df`` untouched, which silently skips the step.
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)

    if outlier_columns is None:
        return df
    if isinstance(outlier_columns, str):
        # Iterating a bare string would yield its characters, not the label.
        outlier_columns = [outlier_columns]
    return _remove_outliers(df, outlier_columns, n_std)