hsuehkuan-lu
/
sentiment-analysis


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
99

	
100

	
101

	
            import re
import string
import yaml
from collections import Counter, OrderedDict
import pandas as pd
import torchtext
from torchtext.data.utils import get_tokenizer

# Lookup table from English contractions to their expanded forms.
# preprocess_text passes it to clean_contractions, which matches
# space-separated tokens against the keys.
mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    # This value was split across two physical lines inside the string
    # literal, which is not valid Python; it is restored to one line.
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Characters deleted by remove_punctuation (ASCII punctuation from the
# standard library).
PUNCT_TO_REMOVE = string.punctuation


def remove_punctuation(text):
    """Return ``text`` with every character in ``PUNCT_TO_REMOVE`` deleted."""
    # Third argument of maketrans lists the characters to drop; nothing is
    # substituted, so the first two arguments stay empty.
    deletion_table = str.maketrans('', '', PUNCT_TO_REMOVE)
    stripped = text.translate(deletion_table)
    return stripped


def clean_contractions(text, mapping):
    """Expand contractions in ``text`` using the ``mapping`` table.

    Apostrophe look-alikes (right/left single quote, acute accent, backtick)
    are first normalised to a plain ``'``. The text is then split on single
    spaces and each token that is a key of ``mapping`` is replaced by its
    expansion; spacing between tokens is preserved.

    A token that is not an exact key is also tried in lowercase, so that
    capitalised contractions (e.g. ``"Don't"`` with only ``"don't"`` in the
    table) are expanded too. In that case a leading capital on the token is
    carried over to the expansion.

    Args:
        text: Input string.
        mapping: Dict mapping contraction -> expanded form.

    Returns:
        The text with known contractions expanded.
    """
    apostrophe_table = str.maketrans({"’": "'", "‘": "'", "´": "'", "`": "'"})
    text = text.translate(apostrophe_table)

    def expand(token):
        # Exact match wins so case-specific entries ("I'd" vs "i'd") are honoured.
        if token in mapping:
            return mapping[token]
        lowered = token.lower()
        if lowered in mapping:
            expansion = mapping[lowered]
            if token[:1].isupper():
                expansion = expansion[:1].upper() + expansion[1:]
            return expansion
        return token

    return ' '.join(expand(token) for token in text.split(" "))


def word_replace(text):
    """Return ``text`` with every literal ``<br />`` occurrence removed.

    The tag is dropped without inserting whitespace, so the text on either
    side of it ends up directly adjacent.
    """
    pieces = text.split('<br />')
    return ''.join(pieces)


def remove_urls(text):
    """Return ``text`` with URLs deleted.

    A URL here is a run of non-whitespace characters starting with
    ``http://``, ``https://`` or ``www.``.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_html(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy and does not cross newlines, so a ``<`` with no
    ``>`` later on the same line is left in place.
    """
    return re.sub('<.*?>', '', text)


def preprocess_text(text, remove_punc=True):
    """Normalise raw text before tokenisation.

    Steps, in order: expand contractions with the module-level ``mapping``,
    lowercase, drop ``<br />`` tags, drop URLs, drop remaining ``<...>``
    spans, and (when ``remove_punc`` is true) delete punctuation.

    Args:
        text: Input string.
        remove_punc: Whether to strip punctuation as the final step.

    Returns:
        The cleaned string.
    """
    cleaned = clean_contractions(text, mapping).lower()
    for step in (word_replace, remove_urls, remove_html):
        cleaned = step(cleaned)
    return remove_punctuation(cleaned) if remove_punc else cleaned


# Load the run configuration from ``params.yaml`` (path is relative to the
# current working directory). This executes at import time, so importing the
# module raises if the file cannot be opened.
with open('params.yaml', 'r') as f:
    PARAMS = yaml.safe_load(f)


# Module-wide tokenizer returned by torchtext's get_tokenizer('basic_english');
# generate_vocabulary and Preprocessor both tokenize through it.
tokenizer = get_tokenizer('basic_english')


def generate_vocabulary():
    """Build a torchtext vocabulary and a small config dict from ``data/all.csv``.

    Every value in the ``PARAMS['feature']`` column is preprocessed and
    tokenized; each token is reduced to its alphabetic characters and counted.
    Tokens that end up empty are discarded. The ``PARAMS['basic']['vocab_size']``
    most frequent tokens are kept, placed after the four special tokens
    (unk, pad, sos, eos, in that order).

    Returns:
        A ``(vocab, config)`` tuple where ``config`` holds ``vocab_size``,
        ``num_classes`` (number of distinct values in the label column),
        ``padding_idx``, ``sos_idx`` and ``eos_idx``.
    """
    token_counts = Counter()
    frame = pd.read_csv('data/all.csv')
    for raw_text in frame[PARAMS['feature']]:
        token_counts.update(
            ''.join(ch for ch in token if ch.isalpha())
            for token in tokenizer(preprocess_text(raw_text))
        )
    # Tokens with no alphabetic characters collapse to '' and are not words.
    del token_counts['']

    num_classes = len(set(frame[PARAMS['label']]))

    most_frequent = token_counts.most_common(PARAMS['basic']['vocab_size'])
    special_tokens = (
        PARAMS['unk_token'],
        PARAMS['pad_token'],
        PARAMS['sos_token'],
        PARAMS['eos_token'],
    )
    # Specials are inserted first so they precede the corpus tokens.
    entries = OrderedDict((tok, 1) for tok in special_tokens)
    entries.update(most_frequent)

    vocab = torchtext.vocab.vocab(entries)
    # Index 0 is used for lookups of tokens missing from the vocabulary;
    # presumably this is the unk token, which is inserted first.
    vocab.set_default_index(0)

    config = {
        'vocab_size': len(vocab),
        'num_classes': num_classes,
        'padding_idx': vocab[PARAMS['pad_token']],
        'sos_idx': vocab[PARAMS['sos_token']],
        'eos_idx': vocab[PARAMS['eos_token']],
    }
    return vocab, config


class Preprocessor(object):
    """Convert raw text into vocabulary indices using a prebuilt vocabulary.

    Args:
        vocab: Vocabulary object supporting ``len()`` and ``vocab[token]``
            lookup (e.g. the one returned by ``generate_vocabulary``).
    """

    def __init__(self, vocab):
        super(Preprocessor, self).__init__()
        self._vocab = vocab
        self._tokenizer = tokenizer

    def __len__(self):
        """Return the number of entries in the wrapped vocabulary."""
        return len(self._vocab)

    def _encode(self, text):
        """Encode one string as a list of vocabulary indices.

        The text is preprocessed and tokenized with the instance tokenizer.
        Each token is reduced to its alphabetic characters before lookup; a
        token with no alphabetic characters is looked up as the empty string.
        """
        return [
            self._vocab[''.join(ch for ch in token if ch.isalpha())]
            for token in self._tokenizer(preprocess_text(text))
        ]

    def text_pipeline(self, text):
        """Encode a string, or each string of a list, as vocabulary indices.

        Args:
            text: A single string or a ``list`` of strings.

        Returns:
            A list of indices for a single string, or a list of such lists
            (one per element) when ``text`` is a ``list``.
        """
        if isinstance(text, list):
            return [self._encode(item) for item in text]
        return self._encode(text)

    def label_pipeline(self, label):
        """Return ``label`` unchanged."""
        return label

    @property
    def vocab(self):
        """The wrapped vocabulary object."""
        return self._vocab