ShoukanLabs
/
AuxiliaryASR


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
            #coding:utf-8
import os
import os.path as osp
import pandas as pd

# Default dictionary file; a bare file name, so it is resolved relative to
# the current working directory when opened.
DEFAULT_DICT_PATH = osp.join('word_index_dict.csv')


class TextCleaner:
    """Convert text into a list of indexes using a CSV-backed lookup table."""

    def __init__(self, word_index_dict_path=DEFAULT_DICT_PATH):
        """Load the symbol -> index table from ``word_index_dict_path``."""
        self.word_index_dictionary = self.load_dictionary(word_index_dict_path)

    def __call__(self, text):
        """Return the indexes of the items of ``text``, in order.

        Items that are not keys of the dictionary are printed to stdout and
        left out of the result.
        """
        encoded = []
        for symbol in text:
            try:
                code = self.word_index_dictionary[symbol]
            except KeyError:
                # Unknown symbol: report it and carry on with the rest.
                print(symbol)
            else:
                encoded.append(code)
        return encoded

    def load_dictionary(self, path):
        """Read a header-less, UTF-8, two-column CSV into a dict.

        The first column supplies the keys and the second the values; when a
        key occurs more than once the last row wins.
        """
        rows = pd.read_csv(path, header=None, encoding="utf8").values
        mapping = {}
        for token, code in rows:
            mapping[token] = code
        return mapping


if __name__ == "__main__":
    # Round-trip demo: encode a sample string with the default dictionary,
    # then map every index back to a symbol and print both results.
    cleaner = TextCleaner()
    encoded = cleaner("ɪn fækt, hiː hæd lʊkt æt twɛnti vɛri mʌtʃ æz hiː lʊkt æt sɪksti, lækɪŋ ə lɪtəl ʌv ðə greɪnəs.$")

    vocabulary = cleaner.word_index_dictionary
    symbols = list(vocabulary.keys())
    codes = list(vocabulary.values())

    # For each index, take the first symbol whose stored value equals it.
    decoded = "".join(symbols[codes.index(code)] for code in encoded)

    print(encoded)
    print(decoded)