Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocess.py 5.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
  1. import re
  2. import string
  3. import yaml
  4. from collections import Counter, OrderedDict
  5. import pandas as pd
  6. import torchtext
  7. from torchtext.data.utils import get_tokenizer
  8. # Removing all punctuations from Text
  9. mapping = {"ain't": "is not", "aren't": "are not", "can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we 
have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
  10. PUNCT_TO_REMOVE = string.punctuation
  11. def remove_punctuation(text):
  12. return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
  13. def clean_contractions(text, mapping):
  14. specials = ["’", "‘", "´", "`"]
  15. for s in specials:
  16. text = text.replace(s, "'")
  17. text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")])
  18. return text
  19. def word_replace(text):
  20. return text.replace('<br />', '')
  21. def remove_urls(text):
  22. url_pattern = re.compile(r'https?://\S+|www\.\S+')
  23. return url_pattern.sub(r'', text)
  24. def remove_html(text):
  25. html_pattern = re.compile('<.*?>')
  26. return html_pattern.sub(r'', text)
  27. def preprocess_text(text, remove_punc=True):
  28. text = clean_contractions(text, mapping)
  29. text = text.lower()
  30. text = word_replace(text)
  31. text = remove_urls(text)
  32. text = remove_html(text)
  33. if remove_punc:
  34. text = remove_punctuation(text)
  35. return text
  36. with open('params.yaml', 'r') as f:
  37. PARAMS = yaml.safe_load(f)
  38. tokenizer = get_tokenizer('basic_english')
  39. def generate_vocabulary():
  40. counter = Counter()
  41. df = pd.read_csv('data/all.csv')
  42. for line in df[PARAMS['feature']]:
  43. counter.update([''.join(list(filter(lambda x: x.isalpha(), [ch for ch in word])))
  44. for word in tokenizer(preprocess_text(line))])
  45. del counter['']
  46. num_classes = len(set([label for label in df[PARAMS['label']]]))
  47. sorted_by_freq_tuples = counter.most_common(PARAMS['basic']['vocab_size'])
  48. specials = (PARAMS['unk_token'], PARAMS['pad_token'], PARAMS['sos_token'], PARAMS['eos_token'])
  49. vocab = torchtext.vocab.vocab(OrderedDict(
  50. [(tok, 1) for tok in specials] + sorted_by_freq_tuples
  51. ))
  52. vocab.set_default_index(0)
  53. config = {
  54. 'vocab_size': len(vocab),
  55. 'num_classes': num_classes,
  56. 'padding_idx': vocab[PARAMS['pad_token']],
  57. 'sos_idx': vocab[PARAMS['sos_token']],
  58. 'eos_idx': vocab[PARAMS['eos_token']]
  59. }
  60. return vocab, config
  61. class Preprocessor(object):
  62. def __init__(self, vocab):
  63. super(Preprocessor, self).__init__()
  64. self._vocab = vocab
  65. self._tokenizer = tokenizer
  66. def __len__(self):
  67. return len(self._vocab)
  68. def text_pipeline(self, text):
  69. if isinstance(text, list):
  70. return [[self._vocab[''.join(list(filter(lambda x: x.isalpha(), i)))] for i in tokenizer(preprocess_text(t))] for t in text]
  71. return [self._vocab[''.join(list(filter(lambda x: x.isalpha(), i)))] for i in tokenizer(preprocess_text(text))]
  72. def label_pipeline(self, label):
  73. return label
  74. @property
  75. def vocab(self):
  76. return self._vocab
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...