prepare_dataset.py

# -*- coding: utf-8 -*-
# ---
# jupyter:
#   jupytext:
#     formats: py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python [conda env:text-data-class]
#     language: python
#     name: conda-env-text-data-class-py
# ---
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import yaml
import janitor  # pyjanitor: importing it registers filter_column_isin() on DataFrames
import matplotlib.pyplot as plt
import numpy as np
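
# The tokenizer, lemmatizer, and stopword list below rely on downloaded NLTK
# corpora. If they are missing in your environment, this one-time setup should
# fetch them (an environment assumption, not part of the original pipeline):
# nltk.download('punkt')      # for word_tokenize
# nltk.download('stopwords')  # for nltk.corpus.stopwords
# nltk.download('wordnet')    # for WordNetLemmatizer
# nltk.download('omw-1.4')    # may be needed by wordnet on newer NLTK versions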
## Load the scraped speeches and keep only the columns used downstream
df = pd.read_json("data/speeches.json")
df = df[['author', 'content', 'id']]
# +
## Bring in parameters
with open("params.yaml", "r") as fd:
    params = yaml.safe_load(fd)
punctuation = params['preprocessing']['punctuation']
stopwords = params['preprocessing']['stopwords'] + nltk.corpus.stopwords.words('english')
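
# For reference, the lookups above assume params.yaml holds two lists under a
# `preprocessing` key, roughly like this (illustrative values, not the
# project's actual file):
#
# preprocessing:
#   punctuation: ['.', ',', ';', ':', '(', ')']
#   stopwords: ['unhcr', 'refugee']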
# +
## Extract titles from content - text up until the first date is mentioned.
df['Title'] = df['content'].str.extract(r"\n+([\s\S]+), \d+\s\w+\s\d+")
# +
### Extract speeches and dates from content
## Speeches follow the marker "Statements by High Commissioner" and the date
date_text = (df['content']
             .str.extract(r"Statements by High Commissioner,\s*(\d+\s\w+\s\d+)[\s\n\r]+([\s\S]+)")
             )
date_text.columns = ['date', 'speech']
# -
date_text
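
# A quick sanity check one could run at this point (not in the original
# script): rows where the regex found no match come back as NaN, so counting
# them shows how many speeches the extraction missed.
# date_text.isna().sum()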
# +
## Convert string dates to datetime
df['date'] = pd.to_datetime(date_text.date)
## Replace line breaks and double spaces with single spaces
df['speech'] = date_text.speech.replace(to_replace = ['\n', '  '], value = ' ', regex = True)
## Lower case
df['speech'] = df.speech.str.lower()
## Remove dates and pesky punctuation
df['speech'] = df.speech.replace(to_replace = [r'(\d+\s\w+\s\d+)', r'([\'\"–]+)'], value = '', regex = True)
# +
## Superficial capitalization switches for my own satisfaction
df['author'] = df.author.replace({'ogata': 'Ogata',
                                  'guterres': 'Guterres',
                                  'lubbers': 'Lubbers',
                                  'khan': 'Khan',
                                  'hartling': 'Hartling',
                                  'schnyder': 'Schnyder',
                                  'lindt': 'Lindt',
                                  'hocké': 'Hocké',
                                  'stoltenberg': 'Stoltenberg'})
df = df.rename(columns={'Title': 'title', 'author': 'speaker'})
# +
# Attempt to filter out Spanish and French (drops 10 speeches)
# (na=True drops rows where the speech extraction failed, matching the
# behavior of comparing against False)
df = df[~df["speech"].str.contains("acnur | réfugiés", na=True)].reset_index()
# -
## Add decade via integer division, e.g. 1997 -> 1990
df['decade'] = (df.date.dt.year//10)*10
# +
## Tokenize and explode so each row holds a single token
df['speech'] = df.speech.apply(nltk.tokenize.word_tokenize)
## drop=True: a plain reset_index() would collide with the 'index' column
## created by the reset_index() call above
df = df.explode('speech').reset_index(drop = True)
# +
## Lemmatize! We don't need tense, focus is on the topics we're covering,
## and this way we'll reduce the likelihood that we're conflating meanings.
## Note: lemmatize() treats each token as a noun by default (pos='n').
wnl = WordNetLemmatizer()
df['speech'] = [wnl.lemmatize(w) for w in df.speech]
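
# A possible refinement (a sketch, not part of the original pipeline): WordNet
# lemmatization is more accurate when given a part-of-speech tag, so one could
# tag tokens first and map Penn Treebank tags to WordNet POS constants. This
# assumes NLTK's averaged_perceptron_tagger resource is available.
def lemmatize_with_pos(tokens):
    """Lemmatize tokens using their POS tags instead of the noun default."""
    tag_map = {'J': wn.ADJ, 'V': wn.VERB, 'N': wn.NOUN, 'R': wn.ADV}
    return [wnl.lemmatize(tok, tag_map.get(tag[0], wn.NOUN))
            for tok, tag in nltk.pos_tag(tokens)]
# Usage would be e.g.: df['speech'] = lemmatize_with_pos(df.speech.tolist())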
# +
## Remove stopwords and punctuation (pyjanitor's filter_column_isin with
## complement = True keeps the rows whose token is NOT in the list)
df = df.filter_column_isin('speech', stopwords, complement = True)
df = df.filter_column_isin('speech', punctuation, complement = True)
df.speech.value_counts().head(30)
# +
## Keep the tidy columns and persist for the next pipeline stage
## (to_feather requires pyarrow to be installed)
df = df[['id', 'speaker', 'date', 'title', 'speech', 'decade']].reset_index(drop = True)
df.to_feather(r'data/cleaned_speeches')
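
# Downstream stages can load the result back with pandas, e.g.:
# df = pd.read_feather('data/cleaned_speeches')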