A.COLLECT DATA TO DATAFRAME
-Read from file
import pandas as pd flights = pd.read_csv('flights.csv',header=0) flights.head(5)
import pandas as pd planes = pd.read_csv('plane.csv',header=0) planes.head(5)
-Build data frame
import pandas as pd
raw_data = {'first_name': ['Jason', 'Molly', 'Tina', 'Jake', 'Amy','Jason'], 'last_name': ['Miller', 'Jacobson', ".", 'Milner', 'Cooze','Jacob'], 'age': [42, 52, 36, 24, 73,55], 'preTestScore': [4, 24, 31, ".", ".",","], 'postTestScore': ["25,000", "94,000", 57, 62, 70,80]}
df = pd.DataFrame(raw_data, columns = ['first_name', 'last_name', 'age', 'preTestScore', 'postTestScore'])
df
-Read from Pandas (read from sklearn)
from sklearn.datasets import load_boston import pandas as pd import numpy as np
data = load_boston() df = pd.DataFrame(data.data, columns=data.feature_names) df['target'] = data['target']
df.head(4)
B. Transform Data
– Pick up specific columns
flight2=flights[['Month','DayofMonth','DayOfWeek','DepTime','ArrTime','UniqueCarrier','FlightNum','TailNum','ElapsedTime','AirTime','ArrDelay','DepDelay','Origin','Dest']] flight2.head(5)
– Combine columns in DataFrame
flight2=flights[['Month','DayofMonth','DayOfWeek','DepTime','ArrTime','UniqueCarrier','FlightNum','TailNum','ElapsedTime','AirTime','ArrDelay','DepDelay','Origin','Dest']] flight2.head(5)
– filter dataframe
flight2[flight2['UniqueCarrier']=='WN'].head(5)
– Sort dataframe
flight2.sort_values(by='flightDate').head(4)
– Group By
planes.groupby(['manufacturer']).size()
flight2.groupby(['UniqueCarrier', 'Origin','Dest']).agg({'ArrDelay': ['sum'], 'DepDelay': ['sum']}).head(5)
flight2.groupby(['UniqueCarrier', 'Origin','Dest'])['ArrDelay','DepDelay'].sum().head(5)
– Reset Index
flight3=flight2.groupby(['UniqueCarrier', 'Origin','Dest'])['ArrDelay','DepDelay'].max().reset_index() flight3.head(5)
flight4=flight2.groupby(['UniqueCarrier', 'Origin','Dest']).agg({'ArrDelay': ['sum'], 'DepDelay': ['sum']}) for col in flight4.columns: print(col)
flights2=flights[['FlightNum','TailNum']].drop_duplicates()flights2['tailNum']=flights2['TailNum']
flights2.head(5)
C. Enrichment
– Merge two dataframes (left join, right join, inner join)
planes2=planes[['tailnum','type','manufacturer']] planes3=planes[['tailnum','issue_date','model','status']] pd.merge(planes2,planes3, left_on='tailnum', right_on='tailnum').head(5)
pd.merge(planes2,planes3, left_on='tailnum', right_on='tailnum',how='left').head(5)
D- Use function
rectangles = [ { 'height': 60, 'width': 20 }, { 'height': 80, 'width': 9 }, { 'height': 30.4, 'width': 4 }, { 'height': 35.4, 'width': 5 } ] rectangles_df = pd.DataFrame(rectangles)
rectangles_df
def calculate_area(row): return row['height'] * row['width'] rectangles_df.apply(calculate_area, axis=1)
rectangles_df['area'] = rectangles_df.apply(calculate_area, axis=1)
rectangles_df
E- Label Types
- LabelEncoder
LabelEncoder: change the data to the sequence and us it. for example in [‘A’,’A’,’C’,’E’,’C’] => [1,1,2,3,2]
from sklearn import preprocessing
df['new_first_name'] = preprocessing.LabelEncoder().fit_transform(raw_data['first_name'])
df[['new_first_name', 'first_name']].head(5)
df.head(6)
- MultiLabelBinarizer
Change the specific rows and get the features and use it to change to columns
#Creating an MultiLabel Array
multilabel_feature = [("New Delhi", "New York"),
("New York", "Sydney", "Hyderabad", "Bangalore"),
("Hyderabad", "Sydney", "Chennai"),
("Chennai", "New Delhi", "Bangalore"),
("Bangalore", "Chennai","Iraq")]
from sklearn.preprocessing import MultiLabelBinarizer
multilabelbinarizer = MultiLabelBinarizer()
multilabel_encoded_results = multilabelbinarizer.fit_transform(multilabel_feature)
df_multilabel_data = pd.DataFrame(multilabel_encoded_results, columns=multilabelbinarizer.classes_)
df_multilabel_data.head()
- LabelBinarizer
Change the value to binary for example [‘A’,’A’,’C’,’D’] => [001,001,010,100]
import pandas as pd
url = 'http://bit.ly/kaggletrain'
train = pd.read_csv(url)
train.head(5)
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
lb = LabelBinarizer()
train["new_sex"]=lb.fit_transform(train["Sex"])
train.head(5)