1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
#!/usr/bin/env python
# coding: utf-8
# %%
"""Train a multi-class LinearSVC that predicts a US state from tweet text.

Pipeline: read hyperparameters from params.yaml, load tweets and elected-
official Twitter handles, join them, build numeric state labels, fit a
TF-IDF + LinearSVC pipeline, pickle the fitted pipeline, and write the
weighted-average classification metrics to JSON.
"""
import joblib
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import yaml

# Hyperparameters are tracked in params.yaml (DVC-style pipeline).
with open("params.yaml", "r") as fd:
    params = yaml.safe_load(fd)
docs = params["preprocessing"]["max_min_docs"]
ngrams = params["preprocessing"]["n_grams"]

# Load in data
tweets = pd.read_csv('data/tweets.csv')
# Drop tweets not in english
tweets = tweets.loc[tweets['language'] == 'en']
# Strip URLs. regex=True must be explicit: since pandas 2.0 the default for
# Series.str.replace is regex=False, which would treat r'http\S+' literally
# and leave every URL in place.
tweets['tweet'] = tweets['tweet'].str.replace(r'http\S+', '', regex=True)
# Drop tweets that were URL-only and are now empty.
tweets = tweets.loc[tweets['tweet'] != '']
tweets = tweets.reset_index(drop=True)

# One row per (official, account); wide twitter-handle columns go long.
states = pd.read_csv('data/elected_officials.csv')
states = states.melt(id_vars=['State',
                              'StateAbbr',
                              'Name',
                              'Party',
                              'Inauguration',
                              'Title',
                              'office'],
                     value_vars=['officialTwitter',
                                 'campaignTwitter',
                                 'othertwitter'],
                     var_name='account_type',
                     value_name='twitter')
# Lowercase handles so the join below is case-insensitive on this side.
# NOTE(review): assumes tweets['username'] is already lowercase — confirm
# against the tweet scraper; otherwise lowercase it here as well.
states['twitter'] = states['twitter'].str.lower()
tweets = tweets.merge(states, left_on='username', right_on='twitter')

# Create numeric labels based on state names and merge them back in.
labels = pd.DataFrame(tweets['State'].unique()).reset_index()
# Add one because zero indexed
labels['index'] = labels['index'] + 1
labels.columns = ['label', 'State']
tweets = tweets.merge(labels, on='State')

# Targets: numeric state label; features: raw tweet text.
y = tweets['label']
X = tweets["tweet"]

# Training/test split 70/30. test_size previously was 0.5, contradicting the
# stated 70/30 intent; fixed. random_state pins the split for reproducible
# metrics across pipeline runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# TF-IDF preprocessing; document-frequency cutoffs and n-gram range come
# from params.yaml.
vectorizer = TfidfVectorizer(
    min_df=docs['smallest'],
    max_df=docs['largest'],
    stop_words="english",
    ngram_range=(ngrams['min'], ngrams['max'])
)

# Create pipeline with preprocessing and linear SVC
pipe = Pipeline([
    ('preprocess', vectorizer),
    ('LinearSVC', LinearSVC())
])

# Fit pipe to training data
fitted_pipe = pipe.fit(X_train, y_train)
# Export pickled pipeline for downstream prediction stages.
joblib.dump(fitted_pipe, 'outputs/mc_state_pipe.pkl')

# Evaluate on the held-out split and persist weighted-average metrics.
y_pred = pipe.predict(X_test)
metrics = pd.DataFrame(classification_report(y_test, y_pred, output_dict=True))
metrics["weighted avg"].to_json("metrics/mc_state_metrics.json")
|