Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

multiclass_office.py 2.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3. # %%
  4. # %%
  5. import joblib
  6. import numpy as np
  7. import pandas as pd
  8. import re
  9. from sklearn.feature_extraction.text import TfidfVectorizer
  10. from sklearn.metrics import classification_report
  11. from sklearn.model_selection import train_test_split
  12. from sklearn.pipeline import Pipeline
  13. from sklearn.svm import LinearSVC
  14. import yaml
  15. with open("params.yaml", "r") as fd:
  16. params = yaml.safe_load(fd)
  17. docs = params["preprocessing"]["max_min_docs"]
  18. ngrams = params['preprocessing']['n_grams']
  19. #Load in data
  20. tweets = pd.read_csv('data/tweets.csv')
  21. #Drop tweets not in english
  22. tweets = tweets.loc[tweets['language'] == 'en']
  23. tweets['tweet'] = tweets['tweet'].str.replace(r'http\S+', '')
  24. tweets = tweets.loc[tweets['tweet'] != '']
  25. tweets = tweets.reset_index(drop=True)
  26. states = pd.read_csv('data/elected_officials.csv')
  27. states = states.melt(id_vars = ['State',
  28. 'StateAbbr',
  29. 'Name',
  30. 'Party',
  31. 'Inauguration',
  32. 'Title',
  33. 'office'],
  34. value_vars = ['officialTwitter',
  35. 'campaignTwitter',
  36. 'othertwitter'],
  37. var_name = 'account_type',
  38. value_name = 'twitter')
  39. states['twitter'] = states['twitter'].str.lower()
  40. tweets = tweets.merge(states, left_on = 'username', right_on = 'twitter')
  41. #Create numeric labels based on state names
  42. #Merge labels into MTG data frame
  43. labels = pd.DataFrame(tweets['office'].unique()).reset_index()
  44. #Add one because zero indexed
  45. labels['index'] = labels['index']+1
  46. labels.columns = ['label', 'office']
  47. tweets = tweets.merge(labels, on = 'office')
  48. #Select labels as targets
  49. y = tweets['label']
  50. #Select text columns as features
  51. X = tweets["tweet"]
  52. #Training test split 70/30
  53. X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5)
  54. #Preprocess text
  55. vectorizer = TfidfVectorizer(
  56. min_df=docs['smallest'],
  57. max_df=docs['largest'],
  58. stop_words="english",
  59. ngram_range = (ngrams['min'],ngrams['max'])
  60. )
  61. #Create pipeline with preprocessing and linear SVC
  62. pipe = Pipeline([
  63. ('preprocess', vectorizer),
  64. ('LinearSVC', LinearSVC())
  65. ])
  66. #Fit pipe to training data
  67. fitted_pipe = pipe.fit(X_train, y_train)
  68. #Export pickeled pipe
  69. joblib.dump(fitted_pipe, 'outputs/mc_office_pipe.pkl')
  70. #Generate predictions
  71. y_pred = pipe.predict(X_test)
  72. #Output metrics to JSON
  73. metrics = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
  74. metrics["weighted avg"].to_json("metrics/mc_office_metrics.json")
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...