# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     formats: py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python [conda env:text-data-class]
#     language: python
#     name: conda-env-text-data-class-py
# ---
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import yaml
import janitor  # noqa: F401 -- registers pyjanitor DataFrame methods such as filter_column_isin
import matplotlib.pyplot as plt
import numpy as np
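# +
## One-time NLTK data setup: word_tokenize needs 'punkt', the English stopword
## list needs 'stopwords', and WordNetLemmatizer needs 'wordnet'. Safe to
## re-run; nltk skips anything already installed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# -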
df = pd.read_json("data/speeches.json")
df = df[['author', 'content', 'id']]
# +
## Bring in parameters
with open("params.yaml", "r") as fd:
    params = yaml.safe_load(fd)

punctuation = params['preprocessing']['punctuation']
stopwords = params['preprocessing']['stopwords'] + nltk.corpus.stopwords.words('english')
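## For reference, the structure params.yaml is assumed to have (illustrative
## values only; the real lists live in the repo's params file):
## preprocessing:
##   punctuation: ['.', ',', ';', ...]
##   stopwords: ['mr', 'chairman', ...]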
# +
## Extract titles from content - text up until the first date is mentioned
## (non-greedy +? so the match stops at the first date, not the last).
df['Title'] = df['content'].str.extract(r"\n+([\s\S]+?), \d+\s\w+\s\d+")
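# +
## Quick check of the title pattern on a made-up string (hypothetical example,
## not taken from the corpus):
sample = "\n\nStatement to the Executive Committee, 3 October 1988\nMr. Chairman, distinguished delegates..."
re.search(r"\n+([\s\S]+?), \d+\s\w+\s\d+", sample).group(1)
## -> 'Statement to the Executive Committee'
# -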
# +
### Extract speeches and dates from content
## Speeches follow the "Statements by High Commissioner," heading and the date
date_text = (df['content']
             .str.extract(r"Statements by High Commissioner,\s*(\d+\s\w+\s\d+)[\s\n\r]+([\s\S]+)"))
date_text.columns = ['date', 'speech']
# -
date_text
# +
## Convert string dates to datetime
df['date'] = pd.to_datetime(date_text.date)
## Replace line breaks and double spaces with single spaces
df['speech'] = date_text.speech.replace(to_replace = ['\n', '  '], value = ' ', regex = True)
## Lower case
df['speech'] = df.speech.str.lower()
## Remove dates and pesky punctuation
df['speech'] = df.speech.replace(to_replace = [r'(\d+\s\w+\s\d+)', r'([\'\"–]+)'], value = '', regex = True)
# +
## Superficial capitalization switches for my own satisfaction
df['author'] = df.author.replace({'ogata': 'Ogata',
                                  'guterres': 'Guterres',
                                  'lubbers': 'Lubbers',
                                  'khan': 'Khan',
                                  'hartling': 'Hartling',
                                  'schnyder': 'Schnyder',
                                  'lindt': 'Lindt',
                                  'hocké': 'Hocké',
                                  'stoltenberg': 'Stoltenberg'})
df = df.rename(columns={'Title' : 'title', 'author' : 'speaker'})
# +
# Attempt to filter out Spanish and French (drops 10 speeches)
df = df[df["speech"].str.contains("acnur | réfugiés")==False].reset_index(drop = True)
# -
## Add decade
df['decade'] = (df.date.dt.year//10)*10
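## e.g. (1987 // 10) * 10 -> 1980, so each speech lands in its decade bin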
# +
## Tokenize and explode to one row per word
df['speech'] = df.speech.apply(nltk.tokenize.word_tokenize)
df = df.explode('speech').reset_index(drop = True)
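# +
## Optional sanity check: after exploding, each row should hold a single token.
df[['id', 'speech']].head()
# -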
# +
## Lemmatize! We don't need tense, focus is on the topics we're covering,
## and this way we'll reduce the likelihood that we're conflating meanings.
wnl = WordNetLemmatizer()
df['speech'] = df.speech.map(wnl.lemmatize)
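# +
## Illustration: the lemmatizer defaults to treating words as nouns, so
## plurals collapse to their lemma, e.g. 'refugees' -> 'refugee'.
wnl.lemmatize('refugees')
# -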
# +
## Remove stopwords and punctuation
df = df.filter_column_isin('speech', stopwords, complement = True)
df = df.filter_column_isin('speech', punctuation, complement = True)
df.speech.value_counts().head(30)
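## (pyjanitor's filter_column_isin with complement = True is equivalent to
## plain pandas boolean indexing: df[~df['speech'].isin(stopwords)].)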
# +
df = df[['id', 'speaker', 'date', 'title', 'speech', 'decade']].reset_index(drop = True)
df.to_feather('data/cleaned_speeches')
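# +
## Optional round-trip check (feather I/O requires pyarrow to be installed):
pd.read_feather('data/cleaned_speeches').head()
# -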