Dm_code
title: “ ETL project “ date : 2019-10-12 tags : [Data cleaning,Schema creation,Machine learning] header: image: “/images/light.jpg” excerpt: “Data cleaning,Schema creation,Machine learning” mathjax: “true”
Extract and Preprocess the dataset
import petl as etl
table = (
etl
.fromcsv('C:/Users/ali/Desktop/Data Management/congress-terms3.csv')
.convert('chamber', 'upper') ### For Consistency
## For confirmity
.convert('state', {'AL': 'Alabama', 'AK': 'Alaska','AZ':'Arizona',
'AR':'Arkansas','CA':'California','CO':'Colorado' ,'CT':'Connecticut', # To give the complete name of field value
'DE':'Delaware','FL':'Florida','GA':'Georgia','HI':'Hawaii',
'ID':'Idaho','IL':'Illinois','IN':' Indiana','IA':'Iowa','KS':'Kansas',
'KY':'Kentucky','LA':'Louisiana','ME':'Maine','MD':'Maryland',
'MA':'Massachusetts','MI':'Michigan','MN': 'Minnesota','MS':'Mississippi',
'MO': 'Missouri','MT':'Montana','NE':'Nebraska','NV':'Nevada','NH':'New Hampshire',
'NJ':' New Jersey','NM':'New Mexico','NY':'New York','NC':'North Carolina',
'ND':'North Dakota','OH':'Ohio','OK':'Oklahoma','OR':'Oregon',
'PA':'Pennsylvania','RI':'Rhode Island','SC':'South Carolina','SD':'South Dakota',
'TN':' Tennessee', 'TX':'Texas' , 'UT':'Utah','VT':'Vermont',
'VA':'Virginia','WA':' Washington','WV':'West Virginia','WI':'Wisconsin',
'WY':'Wyoming'
}
)
.convert('incumbent',{'Yes':'1','No': '2'}) # Data Enrichment
.convert('birthday', 'MM/DD/YYYY') # For consistency
)
Filtering the data in fact and dimension table
# creation of dimension table of party
table1 = etl.cut(table,'Party_id','party')
#creation of dimension table of state
table2 = etl.cut(table, 'State_id','state')
#Creation of dimension table of chamber
table3 = etl.cut(table, 'Chamber_id','chamber')
# Creation of dimension table of Member of congress
table4 = etl.cut(table, 'Bioguide_id','bioguide','firstname','middlename','lastname','suffix')
# creation of fact table
table5 = etl.cutout(table, 'bioguide','congress','firstname','middlename','lastname','suffix','termstart','birthday','chamber','state','party')
Taking the distict values to make the primary key or id for the dimesnion table and fact table.
table6 =etl.distinct(table1,'Party_id')
table7 =etl.distinct(table2,'State_id')
table8 =etl.distinct(table4,'Bioguide_id')
table9 =etl.distinct(table3,'Chamber_id')
table10=etl.distinct(table5,'Bioguide_id')
table11=etl.rename(table10,'Members_id ','Members_id')
Creating connection with database & Creating database
import pymysql
### Creating connection with DB and database name
connection = pymysql.connect(host = "127.0.0.1", user = "root", password = "Ali786mu",database='etlprocess')
cursor = connection.cursor()
cursor.execute("CREATE DATABASE etlprocess7") ###creating DATABASE NAME
cursor.execute("USE etlprocess7") ## Unsing database
creation of dimension and fact table in mysql database
### Creation of the dimension table 1
cursor.execute("""CREATE TABLE tbl_state
(State_id int NOT NULL PRIMARY KEY,
state varchar(25) NULL)""")
### Creation of the dimension table 2
cursor.execute("""CREATE TABLE tbl_chamber
(Chamber_id int NOT NULL PRIMARY KEY,
chamber varchar(25) NULL)""")
### Creation of the dimesnion table 3
cursor.execute("""CREATE TABLE tbl_party
(Party_id int NOT NULL PRIMARY KEY,
party varchar(25) NULL)""")
### Creation of the dimesnion table 4
cursor.execute("""CREATE TABLE tbl_member
(Bioguide_id int NOT NULL PRIMARY KEY,
bioguide varchar(25) NULL,
firstname varchar(25)NULL,
middlename varchar(25) NULL,
lastname varchar(25) NULL,
suffix varchar(25) NULL)
""")
### Creation of the Fact table
cursor.execute("""CREATE TABLE tbl_Fact
(Members_id int NOT NULL PRIMARY KEY,
Bioguide_id int NOT NULL,
State_id int NOT NULL,
Chamber_id int NOT NULL,
Party_id int NOT NULL,
incumbent int,
age int ,
FOREIGN KEY(Bioguide_id) REFERENCES tbl_member(Bioguide_id),
FOREIGN KEY(State_id) REFERENCES tbl_state(State_id),
FOREIGN KEY(Chamber_id) REFERENCES tbl_chamber(Chamber_id),
FOREIGN KEY(Party_id) REFERENCES tbl_party(Party_id))""")
connection.cursor().execute('SET SQL_MODE=ANSI_QUOTES')
To load the data into database
etl.todb(table6, cursor, 'tbl_party')
etl.todb(table7, cursor, 'tbl_state')
etl.todb(table8, cursor, 'tbl_member')
etl.todb(table9, cursor, 'tbl_chamber')
etl.todb(table11,cursor, 'tbl_fact')
To get the data from database based on the requirement
import pandas as pd
#conn = sqlite3.connect("flights.db")
df = pd.read_sql_query("SELECT age,incumbent,chamber_id,state_id,tbl_party.party from tbl_fact INNER JOIN tbl_party ON tbl_fact.party_id=tbl_party.party_id", connection)
Defining the x and y data field ::::
X = df.iloc[:, :-1].values
y = df.iloc[:, 4].values
Devide the data in to 80:20 ratio for train and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
KNN Classifier algorithm
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=10)
classifier.fit(X_train, y_train)
##### Predicting y with x values
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
y_pred = classifier.predict(X_test)
for accuracy check
print ("Accuracy of KNN is {}".format(accuracy_score(y_test,y_pred)*100))
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
Evaluation of the error depend on the K values
import numpy as np
error = []
# Calculating error for K values between 1 and 40
for i in range(1, 40):
knn = KNeighborsClassifier(n_neighbors=i)
knn.fit(X_train, y_train)
pred_i = knn.predict(X_test)
error.append(np.mean(pred_i != y_test))
Plotting the graph to compare the eeror with K values
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
markerfacecolor='blue', markersize=10)
Decision tree entropy classification
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100)
clf_entropy.fit(X_train, y_train)
y_pred_d = clf_entropy.predict(X_test)
print(confusion_matrix(y_test, y_pred_d))
print(classification_report(y_test, y_pred_d))
print("Accuracy of DTE is {}".format(accuracy_score(y_test,y_pred_d)*100))