Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

featurization.py 2.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
  1. import sys
  2. import dask
  3. import dask.distributed
  4. import numpy as np
  5. import pandas as pd
  6. import scipy.sparse as sparse
  7. from sklearn.feature_extraction.text import CountVectorizer
  8. from sklearn.feature_extraction.text import TfidfTransformer
  9. import pickle
  10. import conf
  # Connect to the Dask scheduler listening on localhost:8786.
  client = dask.distributed.Client('localhost:8786')

  # Print NumPy floats in fixed-point rather than scientific notation.
  np.set_printoptions(suppress=True)

  # Input files and output files, taken from the project's conf module.
  TRAIN_INPUT, TEST_INPUT = conf.train_tsv, conf.test_tsv
  TRAIN_OUTPUT, TEST_OUTPUT = conf.train_matrix, conf.test_matrix
  @dask.delayed
  def workflow(train_input, test_input, train_output, test_output):
      """Turn the train and test TSV files into pickled sparse matrices.

      The bag-of-words vocabulary and the TF-IDF weights are fitted on the
      training text only and then reused unchanged for the test text.
      Each output file holds a pickled CSR matrix whose first two columns
      are the row id and the label, followed by the TF-IDF features.

      Args:
          train_input: path of the training TSV file to read.
          test_input: path of the test TSV file to read.
          train_output: path the training matrix is pickled to.
          test_output: path the test matrix is pickled to.
      """

      def load_frame(tsv_path):
          """Read a header-less tab-separated file into id/label/text columns."""
          frame = pd.read_csv(
              tsv_path,
              encoding='utf-8',
              header=None,
              delimiter='\t',
              names=['id', 'label', 'text'],
          )
          report = 'The input data frame {} size is {}\n'
          sys.stderr.write(report.format(tsv_path, frame.shape))
          return frame

      def lowercase_text(frame):
          """Return the text column, lower-cased, as a unicode NumPy array."""
          # astype('U') coerces every entry to a unicode string before the
          # vectorizer sees it.
          return np.array(frame.text.str.lower().values.astype('U'))

      def dump_matrix(frame, features, pickle_path):
          """Prepend id and label columns to *features* and pickle the result."""
          columns = [
              sparse.csr_matrix(frame.id.astype(np.int64)).T,
              sparse.csr_matrix(frame.label.astype(np.int64)).T,
              features,
          ]
          stacked = sparse.hstack(columns, format='csr')
          msg = 'The output matrix {} size is {} and data type is {}\n'
          sys.stderr.write(msg.format(pickle_path, stacked.shape, stacked.dtype))
          with open(pickle_path, 'wb') as fd:
              pickle.dump(stacked, fd, pickle.HIGHEST_PROTOCOL)

      # --- training set: fit the vocabulary and the IDF weights -----------
      train_frame = load_frame(train_input)
      train_text = lowercase_text(train_frame)

      vectorizer = CountVectorizer(
          stop_words='english', max_features=5000).fit(train_text)
      train_counts = vectorizer.transform(train_text)

      weigher = TfidfTransformer(smooth_idf=False).fit(train_counts)
      dump_matrix(train_frame, weigher.transform(train_counts), train_output)

      # Drop the reference to the training frame before the test frame is
      # loaded.
      del train_frame

      # --- test set: reuse the fitted vectorizer and weights --------------
      test_frame = load_frame(test_input)
      test_counts = vectorizer.transform(lowercase_text(test_frame))
      dump_matrix(test_frame, weigher.transform(test_counts), test_output)
  # Build the delayed featurization task and run it right away.
  workflow(
      train_input=TRAIN_INPUT,
      test_input=TEST_INPUT,
      train_output=TRAIN_OUTPUT,
      test_output=TEST_OUTPUT,
  ).compute()
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...