Dm_code

3 minute read


title: “ ETL project “ date : 2019-10-12 tags : [Data cleaning,Schema creation,Machine learning] header: image: “/images/light.jpg” excerpt: “Data cleaning,Schema creation,Machine learning” mathjax: “true”


Extract and Preprocess the dataset


import petl as etl

table = (
       etl
        .fromcsv('C:/Users/ali/Desktop/Data Management/congress-terms3.csv')    
        .convert('chamber', 'upper')  ###   For Consistency  
         ## For confirmity 
        .convert('state', {'AL': 'Alabama', 'AK': 'Alaska','AZ':'Arizona',    
                  'AR':'Arkansas','CA':'California','CO':'Colorado' ,'CT':'Connecticut',          # To give the complete name of field value 
                           'DE':'Delaware','FL':'Florida','GA':'Georgia','HI':'Hawaii',
                           'ID':'Idaho','IL':'Illinois','IN':' Indiana','IA':'Iowa','KS':'Kansas',
                           'KY':'Kentucky','LA':'Louisiana','ME':'Maine','MD':'Maryland',
                  'MA':'Massachusetts','MI':'Michigan','MN': 'Minnesota','MS':'Mississippi',
                           'MO': 'Missouri','MT':'Montana','NE':'Nebraska','NV':'Nevada','NH':'New Hampshire',
                           'NJ':' New Jersey','NM':'New Mexico','NY':'New York','NC':'North Carolina',
                           'ND':'North Dakota','OH':'Ohio','OK':'Oklahoma','OR':'Oregon',
                           'PA':'Pennsylvania','RI':'Rhode Island','SC':'South Carolina','SD':'South Dakota',
                           'TN':' Tennessee',   'TX':'Texas' ,  'UT':'Utah','VT':'Vermont',
                           'VA':'Virginia','WA':' Washington','WV':'West Virginia','WI':'Wisconsin',
                           'WY':'Wyoming'             
                }
                
                )
       .convert('incumbent',{'Yes':'1','No': '2'}) #  Data Enrichment 
       
       .convert('birthday', 'MM/DD/YYYY')  # For consistency 
       
 )




Filtering the data in fact and dimension table

# creation of dimension table of party
table1 = etl.cut(table,'Party_id','party')
#creation of dimension table of state 
table2 = etl.cut(table, 'State_id','state') 
#Creation of dimension table of chamber
table3 = etl.cut(table, 'Chamber_id','chamber')
# Creation of dimension table of Member of congress
table4 = etl.cut(table, 'Bioguide_id','bioguide','firstname','middlename','lastname','suffix')
# creation of fact table
table5 = etl.cutout(table, 'bioguide','congress','firstname','middlename','lastname','suffix','termstart','birthday','chamber','state','party') 



Taking the distict values to make the primary key or id for the dimesnion table and fact table.

table6 =etl.distinct(table1,'Party_id')     
table7 =etl.distinct(table2,'State_id')
table8 =etl.distinct(table4,'Bioguide_id')
table9 =etl.distinct(table3,'Chamber_id')
table10=etl.distinct(table5,'Bioguide_id')
table11=etl.rename(table10,'Members_id ','Members_id')


 

Creating connection with database & Creating database

import pymysql

### Creating connection with DB and database name  

connection = pymysql.connect(host = "127.0.0.1", user = "root", password = "Ali786mu",database='etlprocess') 
cursor = connection.cursor()

cursor.execute("CREATE DATABASE etlprocess7")  ###creating DATABASE NAME

cursor.execute("USE etlprocess7")     ## Unsing database

creation of dimension and fact table in mysql database

### Creation of the dimension table 1

cursor.execute("""CREATE TABLE tbl_state        
(State_id int NOT NULL PRIMARY KEY,
state varchar(25) NULL)""")
### Creation of the dimension table 2
cursor.execute("""CREATE TABLE tbl_chamber    
(Chamber_id int NOT NULL PRIMARY KEY,
chamber varchar(25) NULL)""")
###  Creation of the dimesnion table 3
cursor.execute("""CREATE TABLE tbl_party      
(Party_id int NOT NULL PRIMARY KEY,
party varchar(25) NULL)""")
### Creation of the dimesnion table 4
cursor.execute("""CREATE TABLE tbl_member    
(Bioguide_id int NOT NULL PRIMARY KEY,
 bioguide varchar(25) NULL,
 firstname varchar(25)NULL,
 middlename varchar(25) NULL,
 lastname varchar(25) NULL,
 suffix varchar(25) NULL)
 """)
### Creation of the Fact table 
cursor.execute("""CREATE TABLE tbl_Fact      
(Members_id int NOT NULL PRIMARY KEY,

Bioguide_id int NOT NULL,
State_id int NOT NULL,
Chamber_id int NOT NULL,
Party_id int NOT NULL,
incumbent int,
age int , 
 
FOREIGN KEY(Bioguide_id) REFERENCES tbl_member(Bioguide_id),
FOREIGN KEY(State_id) REFERENCES tbl_state(State_id),
FOREIGN KEY(Chamber_id) REFERENCES tbl_chamber(Chamber_id),
FOREIGN KEY(Party_id) REFERENCES tbl_party(Party_id))""")

connection.cursor().execute('SET SQL_MODE=ANSI_QUOTES')

To load the data into database

etl.todb(table6, cursor, 'tbl_party')  
etl.todb(table7, cursor, 'tbl_state')  
etl.todb(table8, cursor, 'tbl_member')
etl.todb(table9, cursor, 'tbl_chamber')
etl.todb(table11,cursor, 'tbl_fact')

To get the data from database based on the requirement

import pandas as pd
#conn = sqlite3.connect("flights.db")
df = pd.read_sql_query("SELECT age,incumbent,chamber_id,state_id,tbl_party.party from tbl_fact INNER JOIN tbl_party ON tbl_fact.party_id=tbl_party.party_id", connection)

Defining the x and y data field ::::

X = df.iloc[:, :-1].values
y = df.iloc[:, 4].values

Devide the data in to 80:20 ratio for train and test dataset

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

KNN Classifier algorithm

from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=10)
classifier.fit(X_train, y_train)
##### Predicting y with x values 
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

y_pred = classifier.predict(X_test)

for accuracy check

print ("Accuracy of KNN is {}".format(accuracy_score(y_test,y_pred)*100))
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Evaluation of the error depend on the K values

import numpy as np

error = []

# Calculating error for K values between 1 and 40
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error.append(np.mean(pred_i != y_test))


Plotting the graph to compare the eeror with K values

import matplotlib.pyplot as plt
plt.figure(figsize=(10, 6))
plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
 markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

plt.plot(range(1, 40), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)

Decision tree entropy classification

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier 
clf_entropy = DecisionTreeClassifier(criterion = "entropy", random_state = 100)
clf_entropy.fit(X_train, y_train)

y_pred_d = clf_entropy.predict(X_test)

print(confusion_matrix(y_test, y_pred_d))
print(classification_report(y_test, y_pred_d))


print("Accuracy of DTE is {}".format(accuracy_score(y_test,y_pred_d)*100))

Updated: