Are you sure you want to delete this access key?
---
jupyter:
  jupytext:
    formats: ipynb,md
    text_representation:
      extension: .md
      format_name: markdown
      format_version: '1.3'
      jupytext_version: 1.13.8
  kernelspec:
    display_name: Python 3
    language: python
    name: python3
---
Task: Topic Modeling
Method: BERTopic
Script: topic_model.py
DVC YAML Stage: topic_model
#!dvc pull
from bertopic import BERTopic
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import re
import seaborn as sn
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import yaml
# Lift pandas' display limits so that every column, the full tweet text and
# long sequences are shown without truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_seq_items', None)
# Load the tweet data; the later sections all work from this frame.
tweets = pd.read_csv('data/tweets.csv')
# Drop tweets not in English. Take a copy so the column assignment below
# writes to an independent frame rather than to a slice of the raw data.
tweets = tweets.loc[tweets['language'] == 'en'].copy()
# Strip URLs. regex=True must be explicit: from pandas 2.0 the default is
# regex=False, which would treat the pattern as a literal string and leave
# every link in place.
tweets['tweet'] = tweets['tweet'].str.replace(r'http\S+', '', regex=True)
# Discard tweets that are empty once the links are removed, then renumber rows.
tweets = tweets.loc[tweets['tweet'] != '']
tweets = tweets.reset_index(drop=True)
tweets.shape
tweets.dtypes
tweets.head(5)
# Restore the previously fitted BERTopic model from disk.
topic_model = BERTopic.load('project_BERTopic')
# Keep the model's topics for the plot further down; their count is the
# value displayed by this cell.
topics_list = topic_model.get_topics()
len(topics_list)
#Static image in case real plot doesn't load
from IPython.display import Image
Image(filename='plots/topicmodel_full.png')
# Interactive counterpart of the static image above. topics_list is passed as
# the first argument (presumably the topics to draw — confirm against the
# installed BERTopic version).
topic_model.visualize_topics(topics_list)
# Recover per-document probabilities and topic assignments from the HDBSCAN
# model stored on the loaded topic model.
# NOTE(review): _map_predictions is a private BERTopic method — confirm it
# still exists in the installed version.
probs = topic_model.hdbscan_model.probabilities_
topics = topic_model._map_predictions(topic_model.hdbscan_model.labels_)
# Reduce to 10 topics, keeping the updated assignments and probabilities.
new_topics, new_probs = topic_model.reduce_topics(tweets['tweet'], topics, probs, nr_topics = 10)
# Static fallback image for the reduced model.
Image(filename='plots/topicmodel_10.png')
# Called without arguments this time; presumably reduce_topics updated
# topic_model in place so this shows the 10-topic model — TODO confirm.
topic_model.visualize_topics()
# Rows 1 through 9 of the topic summary table (row 0 is skipped).
topic_model.get_topic_info()[1:10]
# Track the reduced topic assignments over time, using each tweet's 'date'
# column as its timestamp.
dynamic_topics = topic_model.topics_over_time(tweets['tweet'],
new_topics,
tweets['date'])
#Static image in case full plot does not load
Image(filename='plots/topics_over_time.png')
# Interactive version of the plot, limited to topics 0 through 9.
topic_model.visualize_topics_over_time(dynamic_topics,
topics=[0,1,2,3,4,5,6,7,8,9],
width = 950)
Observations:
Task: Multiclass Classification (State)
Method: Linear Support Vector Classifier
Number of Classes: 50 (U.S. States)
Script: multiclass_state.py
DVC YAML Stage: multiclass_state
import joblib
import numpy as np
from sklearn.metrics import (confusion_matrix, precision_recall_fscore_support, classification_report)
# Restore the trained multiclass (state) pipeline from disk.
pipe = joblib.load('outputs/mc_state_pipe.pkl')

# Reshape the elected-officials table from one row per official to one row per
# (official, Twitter account), so each handle can be matched to tweet authors.
official_columns = ['State', 'StateAbbr', 'Name', 'Party',
                    'Inauguration', 'Title', 'office']
handle_columns = ['officialTwitter', 'campaignTwitter', 'othertwitter']
states = pd.read_csv('data/elected_officials.csv').melt(
    id_vars=official_columns,
    value_vars=handle_columns,
    var_name='account_type',
    value_name='twitter',
)
# Handles are lower-cased before joining (assumes tweet usernames are already
# lower-case — confirm against the tweet data).
states['twitter'] = states['twitter'].str.lower()
# Attach the official's details to every tweet written from one of their accounts.
tweets = tweets.merge(states, left_on='username', right_on='twitter')
# Give every state an integer class label, numbered from 1 in order of first
# appearance in the tweets frame, and attach that label to each tweet.
state_names = tweets['State'].unique()
labels = pd.DataFrame({'state_label': range(1, len(state_names) + 1),
                       'State': state_names})
tweets = tweets.merge(labels, on='State')
# The tweet text is the only feature; the numeric state label is the target.
X = tweets["tweet"]
y = tweets['state_label']
# NOTE(review): the loaded pipeline is refit here and then asked to predict the
# same rows it was just fit on, so everything derived from y_pred is in-sample.
pipe.fit(X, y)
y_pred = pipe.predict(X)
Rather than print out a 50x50 confusion matrix, I'm going to simplify the matrix to just a few columns:
-state: the abbreviation for the state
-correct: the number of correctly classified tweets for that state
-incorrect: the number of incorrectly classified tweets for that state
-total_tweets: the total number of tweets for that state (correct + incorrect)
-precision: true positives/(true positives + false positives)
-recall: true positives/(true positives + false negatives)
-errors: the state labels which were incorrectly applied to that state's tweets (i.e., its false negatives)
# Per-state summary of the confusion matrix (rows = true state, columns =
# predicted state). Assumes pd.unique(tweets['StateAbbr']) lists the states in
# the same order the numeric labels were assigned.
state_abbrs = pd.unique(tweets['StateAbbr'])
counts = confusion_matrix(y, y_pred)
hits = np.diag(counts)
actual_totals = counts.sum(axis=1)
predicted_totals = counts.sum(axis=0)
state_cm = pd.DataFrame({'state': state_abbrs,
                         'correct': hits,
                         'incorrect': actual_totals - hits,
                         'total_tweets': actual_totals,
                         'precision': hits / predicted_totals,
                         'recall': hits / actual_totals})
# Keep the full matrix as a frame labelled with state abbreviations on both axes.
cm = pd.DataFrame(counts, index=state_abbrs, columns=state_abbrs)
# For each state, list every other state its tweets were predicted as: the
# non-zero off-diagonal cells of that state's row.
cols = cm.columns.values
mask = counts > 0
np.fill_diagonal(mask, False)
out = [cols[row].tolist() for row in mask]
state_cm['errors'] = out
state_cm
Observations:
Task: Multiclass Classification (Political Office)
Method: Linear Support Vector Classifier
Number of Classes: 5 (Governor, Lieutenant Governor, Attorney General, Secretary of State, Treasurer)
Script: multiclass_office.py
DVC YAML Stage: multiclass_office
# Restore the trained multiclass (political office) pipeline from disk.
pipe = joblib.load('outputs/mc_office_pipe.pkl')
# Number the offices from 1 in order of first appearance and attach the label
# to each tweet.
office_names = tweets['office'].unique()
labels = pd.DataFrame({'office_label': range(1, len(office_names) + 1),
                       'office': office_names})
tweets = tweets.merge(labels, on='office')
# The tweet text is the only feature; the numeric office label is the target.
X = tweets["tweet"]
y = tweets['office_label']
# NOTE(review): the loaded pipeline is refit and then scored on the same rows.
pipe.fit(X, y)
y_pred = pipe.predict(X)
ConfusionMatrixDisplay.from_predictions(
    y,
    y_pred,
    display_labels=pd.unique(tweets['office']),
)
#plt.savefig('plots/office_cm.png')
# Per-office summary of the confusion matrix. scikit-learn puts the true class
# on the rows and the predicted class on the columns, so:
#   precision = diagonal / column sums (everything predicted as the office)
#   recall    = diagonal / row sums    (everything truly in the office)
# The original had the two axes swapped, reporting recall as precision and
# vice versa, and disagreeing with the state-level table.
cm = confusion_matrix(y,y_pred)
office_cm = pd.DataFrame.from_dict({'office': pd.unique(tweets['office']),
                                    'correct': np.diag(cm),
                                    'incorrect': cm.sum(axis=1)-np.diag(cm),
                                    'total_tweets': cm.sum(axis=1),
                                    'precision': np.diag(cm)/cm.sum(axis=0),
                                    'recall': np.diag(cm)/cm.sum(axis=1)})
office_cm
Observations:
Task: Binary Classification (Political Party)
Method: Linear Support Vector Classifier
Number of Classes: 2 (Democrat, Republican)
Script: twoclass_party.py
DVC YAML Stage: twoclass_party
Note: Two officials are Independents and were excluded from this model. In Minnesota, the Democratic Party is called the Democratic–Farmer–Labor Party (DFL); politicians in that party were recoded as Democrats.
# Restore the trained two-class (party) pipeline from disk.
pipe = joblib.load('outputs/bc_party_pipe.pkl')
# Number the parties from 1 in order of first appearance and attach the label
# to each tweet.
party_names = tweets['Party'].unique()
labels = pd.DataFrame({'party_label': range(1, len(party_names) + 1),
                       'Party': party_names})
tweets = tweets.merge(labels, on='Party')
# Independents are left out of the party model.
partyclass = tweets.loc[tweets['Party'] != 'Independent']
# The tweet text is the only feature; the numeric party label is the target.
X = partyclass["tweet"]
y = partyclass['party_label']
# NOTE(review): the loaded pipeline is refit and then scored on the same rows.
pipe.fit(X, y)
y_pred = pipe.predict(X)
class_names = pd.unique(partyclass['Party'])
ConfusionMatrixDisplay.from_predictions(y, y_pred, display_labels=class_names)
#plt.savefig('plots/party_cm.png')
print(classification_report(y, y_pred, target_names=class_names))
Observations:
Task: Ideal Point Generation
Method: Wordfish (via R packages `quanteda` and `quanteda.textmodels`)
Output: Value indicating ideological position on left-right scale (further right = more conservative)
Script: ideal_points.R
Note: I was only able to find ideal points for governors and state treasurers. For Lt. Governors, Secretaries of State, and Attorneys General, the algorithm did not converge.
# Static ideal-point plots; judging by the file names and the note above,
# these are the governors' and state treasurers' results respectively.
Image(filename='plots/gov_ideal.png')
Image(filename='plots/trs_ideal.png')
Observations:
Conclusions and avenues for further exploration:
Press p to see the previous file, or n to see the next file
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?
Are you sure you want to delete this access key?