topic_model.py
# ---
# jupyter:
#   jupytext:
#     formats: py:light
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.13.6
#   kernelspec:
#     display_name: Python [conda env:unhcr_speeches]
#     language: python
#     name: conda-env-unhcr_speeches-py
# ---

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
import yaml
import janitor as pj
from sklearn.feature_extraction.text import TfidfVectorizer
from bertopic import BERTopic
from hdbscan import HDBSCAN
import gensim
from gensim import models
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import Phrases, LdaModel
from gensim.models.doc2vec import TaggedDocument
from gensim.test.utils import datapath
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import os
import logging
import pickle

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

df = pd.read_feather('data/cleaned_speeches')

# +
# Load preprocessing and LDA parameters
with open('params.yaml', 'r') as fd:
    params = yaml.safe_load(fd)

d_f = params['preprocessing']['df']
stopwords = (params['preprocessing']['stopwords']
             + nltk.corpus.stopwords.words('english')
             + nltk.corpus.stopwords.words('spanish')
             + nltk.corpus.stopwords.words('french'))
punctuation = params['preprocessing']['punctuation']
passes = params['lda']['passes']
iterations = params['lda']['iterations']
num_topics = params['lda']['num_topics']

# +
# Tokenize
df['speech'] = df.speech.apply(nltk.tokenize.word_tokenize)
# -

# One token per row so that token-level filtering can use pyjanitor
df = df.explode('speech').reset_index()

# +
# Lemmatize (each row holds a single token after the explode)
wnl = WordNetLemmatizer()
df['speech'] = ' '.join([wnl.lemmatize(w) for w in df.speech]).split()

# +
# Remove stopwords and punctuation
df = df.filter_column_isin('speech', stopwords, complement=True)
df = df.filter_column_isin('speech', punctuation, complement=True)
# -

# Collapse the tokens back into one speech per document
df = (df.groupby(['id', 'speaker', 'date', 'title', 'decade'])['speech']
        .apply(' '.join)
        .reset_index())

# +
# Re-tokenize
df['speech'] = df.speech.apply(nltk.tokenize.word_tokenize)

# +
# Make the text for each document a list of tokens for bigrams/LDA
docs_tagged = (
    df
    .apply(lambda row: TaggedDocument(row.speech, [row.id]), axis=1)
    .tolist()
)

# +
# Clean off the tags because they confuse the bigrams
docs = pd.DataFrame(docs_tagged)
docs = docs['words'].tolist()

# +
# Detect frequent bigrams and append them to each document's token list
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            docs[idx].append(token)

# +
# Dictionary and corpus function
def prep_corpus(docs, no_below=d_f['min'], no_above=d_f['max']):
    print('Building dictionary...')
    dictionary = Dictionary(docs)
    stopword_ids = map(dictionary.token2id.get, stopwords)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    print(dictionary)
    dictionary.compactify()
    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    return dictionary, corpus
### https://github.com/XuanX111/Friends_text_generator/blob/master/Friends_LDAvis_Xuan_Qi.ipynb
# -

dictionary, corpus = prep_corpus(docs)

# Persist the corpus and dictionary for downstream steps
MmCorpus.serialize('speech.mm', corpus)
dictionary.save('speech.dict')

# Train and save the LDA model
lda_model = LdaModel(corpus=corpus,
                     num_topics=num_topics,
                     eval_every=1,
                     passes=passes,
                     iterations=iterations,
                     id2word=dictionary,
                     random_state=np.random.RandomState(42))
lda_model.save('lda_model')

# Pickle the token lists
with open('data/docs', 'wb') as fp:
    pickle.dump(docs, fp)
with open('data/docs_tagged', 'wb') as fp:
    pickle.dump(docs_tagged, fp)
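
The script imports pyLDAvis and gensimvis but never calls them. Below is a minimal follow-up sketch, not part of the original script, assuming the artifacts saved above ('lda_model', 'speech.mm', 'speech.dict') are on disk; the output path 'lda_vis.html' is an assumption.

from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Reload the artifacts written by topic_model.py (file names taken from the
# save calls above); 'lda_vis.html' is an assumed output path.
lda_model = LdaModel.load('lda_model')
corpus = MmCorpus('speech.mm')
dictionary = Dictionary.load('speech.dict')

# Build the interactive pyLDAvis topic view and write it to HTML
vis = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, 'lda_vis.html')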