topics_over_time.py

# ---
# jupyter:
#   jupytext:
#     formats: py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python [conda env:unhcr_speeches]
#     language: python
#     name: conda-env-unhcr_speeches-py
# ---
import pandas as pd
import matplotlib.pyplot as plt
import gensim
from gensim import corpora
import pickle
# +
# Pull in the trained LDA model, dictionary, corpus, and tagged documents
topic_model = gensim.models.ldamodel.LdaModel.load('lda_model')
dictionary = corpora.Dictionary.load('speech.dict')
corpus = corpora.MmCorpus('speech.mm')
with open('data/docs_tagged', 'rb') as fp:
    docs_tagged = pickle.load(fp)
# +
# Generate a dataframe of topic weights for every document
topics = pd.DataFrame()
topics['topics'] = topic_model.get_document_topics(corpus)
sf = pd.DataFrame(data=topics.topics)
# Add one (initially empty) column per topic, then fill in each document's
# topic weights; topics the model omits for a document stay at 0
af = pd.DataFrame()
for i in range(10):
    af[str(i)] = []
frames = [sf, af]
af = pd.concat(frames).fillna(0)
for i in range(len(topics)):
    for topic_id, weight in topics['topics'][i]:
        af.loc[i, str(topic_id)] = weight
af = af.reset_index()
## We will merge on the index - documents are in the same order as our tagged
## docs dataset, whose tags let us merge back to the original dataset with
## date information
### https://stackoverflow.com/questions/66403628/how-to-change-topic-list-from-gensim-lda-get-document-topics-to-a-dataframe
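# +
# Alternative sketch (not part of the original pipeline): gensim's
# matutils.corpus2dense can build the same documents-x-topics matrix without
# the nested loops. Shown for reference only; it assumes the model exposes
# its topic count via num_topics, as a trained LdaModel does.
from gensim import matutils

dense = matutils.corpus2dense(topic_model.get_document_topics(corpus),
                              num_terms=topic_model.num_topics).T
af_alt = pd.DataFrame(dense,
                      columns=[str(i) for i in range(topic_model.num_topics)])
# -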
# +
# Pull in tagged documents to merge tags onto the document-topic dataset
docs_tagged = pd.DataFrame(docs_tagged).explode('tags').reset_index()
df_topics = (
    af.merge(docs_tagged, on='index')
    [['tags', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']]
).reset_index()
# +
# Pull in the original dataset with date information
df_merge = (
    pd.read_feather('data/cleaned_speeches')
    [['id', 'date']]
    .drop_duplicates()
    .reset_index(drop=True)
)
# +
# Merge the topic dataframe onto the original dataframe
df = (df_merge.merge(df_topics, left_on='id', right_on='tags')
      [['date', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']])
# Add a year variable
df['year'] = df['date'].dt.to_period('Y')
# +
# Rename and reorder columns
df.columns = ['date'] + [f'topic {i}' for i in range(1, 11)] + ['year']
df = df[['date', 'year'] + [f'topic {i}' for i in range(1, 11)]]
# +
# Add a count variable so documents per year can be tallied
df['count'] = 1
# +
# Sum topic weights (and document counts) by year
sums = (df
        .groupby('year')
        [['count', 'topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5',
          'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10']]
        .sum()
        )
# +
# Divide each year's summed topic weights by the number of documents that
# year, giving the average (relative) topic share per year
topics_over_time = (sums[['topic 1', 'topic 2', 'topic 3', 'topic 4', 'topic 5',
                          'topic 6', 'topic 7', 'topic 8', 'topic 9', 'topic 10']]
                    .div(sums['count'],
                         axis='index')
                    ).reset_index()
# -
topics_over_time
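# +
# Quick sanity check (an added sketch, not in the original script): each
# document's weights from get_document_topics sum to roughly 1 (gensim drops
# topics below a minimum probability), so every year's averaged shares should
# sum to just under 1. The 0.9 lower bound is an assumed tolerance.
yearly_totals = topics_over_time.drop(columns='year').sum(axis=1)
assert ((yearly_totals > 0.9) & (yearly_totals <= 1.0 + 1e-6)).all()
# -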
# +
# Save
topics_over_time.to_feather(r'data/topics_over_time')
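# +
# Plotting sketch (an addition, relying on the matplotlib import above): one
# line per topic showing its average share of speeches per year.
fig, ax = plt.subplots(figsize=(10, 6))
for col in topics_over_time.columns.drop('year'):
    ax.plot(topics_over_time['year'].astype(str), topics_over_time[col],
            label=col)
ax.set_xlabel('Year')
ax.set_ylabel('Average topic share')
ax.legend()
plt.show()
# -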