Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

text_utils.py 1.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
  1. #coding:utf-8
  2. import os
  3. import os.path as osp
  4. import pandas as pd
  # Default location of the word-to-index CSV. This is a bare file name, so it
  # is resolved against the current working directory when the file is opened.
  DEFAULT_DICT_PATH = 'word_index_dict.csv'
  class TextCleaner:
      """Translate a string into a list of per-character indices.

      The lookup table is read once, at construction time, from a header-less
      CSV file whose rows each unpack into a (word, index) pair.
      """

      def __init__(self, word_index_dict_path=DEFAULT_DICT_PATH):
          """Load the lookup table stored at ``word_index_dict_path``."""
          self.word_index_dictionary = self.load_dictionary(word_index_dict_path)

      def __call__(self, text):
          """Return the indices of the characters of ``text``, in order.

          A character that is missing from the table is printed to stdout and
          left out of the result, so the output can be shorter than ``text``.
          """
          indexes = []
          for symbol in text:
              try:
                  found = self.word_index_dictionary[symbol]
              except KeyError:
                  # Unknown character: report it and carry on with the rest.
                  print(symbol)
              else:
                  indexes.append(found)
          return indexes

      def load_dictionary(self, path):
          """Read the CSV at ``path`` and return it as a ``{word: index}`` dict.

          The file is read as UTF-8 without a header row; every row must hold
          exactly two values. When a word occurs more than once, the last row
          wins.
          """
          rows = pd.read_csv(path, header=None, encoding="utf8").values
          mapping = {}
          for word, index in rows:
              mapping[word] = index
          return mapping
  if __name__ == "__main__":
      # Smoke test: encode a sample IPA sentence into indices, then decode the
      # indices back into text so the round trip can be inspected by eye.
      cleaner = TextCleaner()
      out = cleaner("ɪn fækt, hiː hæd lʊkt æt twɛnti vɛri mʌtʃ æz hiː lʊkt æt sɪksti, lækɪŋ ə lɪtəl ʌv ðə greɪnəs.$")

      # Invert the table once so each index is decoded with a single dict
      # lookup instead of a linear list.index() scan per index. setdefault
      # keeps the first word seen for an index, which is the entry that
      # list.index() used to return.
      index_to_word = {}
      for word, index in cleaner.word_index_dictionary.items():
          index_to_word.setdefault(index, word)

      fin = "".join(index_to_word[index] for index in out)
      print(out)
      print(fin)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...