Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

featurization.py 3.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
  1. """
  2. Transform dataset to feature set.
  3. Routine Listings
  4. ----------------
  5. get_params()
  6. Get the DVC stage parameters.
  7. featurize(train_input, test_input, train_output, test_output, max_features)
  8. Transform data to features.
  9. """
  10. import sys
  11. import dask
  12. import dask.distributed
  13. import numpy as np
  14. import pandas as pd
  15. import scipy.sparse as sparse
  16. from sklearn.feature_extraction.text import CountVectorizer
  17. from sklearn.feature_extraction.text import TfidfTransformer
  18. import pickle
  19. import conf
  20. def get_params():
  21. """Get the DVC stage parameters."""
  22. return {
  23. 'max_features': 5000
  24. }
  25. @dask.delayed
  26. def featurize(train_input, test_input, train_output, test_output,
  27. max_features):
  28. """Transform data to features."""
  29. def get_df(input):
  30. """Load dataset from a CSV file."""
  31. df = pd.read_csv(
  32. input,
  33. encoding='utf-8',
  34. header=None,
  35. delimiter='\t',
  36. names=['id', 'label', 'text']
  37. )
  38. sys.stderr.write('The input data frame {} size is {}\n'.format(
  39. input, df.shape))
  40. return df
  41. def save_matrix(df, matrix, output):
  42. """Save feature matrix."""
  43. id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
  44. label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T
  45. result = sparse.hstack([id_matrix, label_matrix, matrix], format='csr')
  46. msg = 'The output matrix {} size is {} and data type is {}\n'
  47. sys.stderr.write(msg.format(output, result.shape, result.dtype))
  48. with open(output, 'wb') as fd:
  49. pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
  50. pass
  51. df_train = get_df(train_input)
  52. train_words = np.array(df_train.text.str.lower().values.astype('U'))
  53. bag_of_words = CountVectorizer(
  54. stop_words='english', max_features=max_features)
  55. bag_of_words.fit(train_words)
  56. train_words_binary_matrix = bag_of_words.transform(train_words)
  57. tfidf = TfidfTransformer(smooth_idf=False)
  58. tfidf.fit(train_words_binary_matrix)
  59. train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)
  60. save_matrix(df_train, train_words_tfidf_matrix, train_output)
  61. del df_train
  62. df_test = get_df(test_input)
  63. test_words = np.array(df_test.text.str.lower().values.astype('U'))
  64. test_words_binary_matrix = bag_of_words.transform(test_words)
  65. test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
  66. save_matrix(df_test, test_words_tfidf_matrix, test_output)
  67. if __name__ == '__main__':
  68. client = dask.distributed.Client('localhost:8786')
  69. np.set_printoptions(suppress=True)
  70. INPUT_TRAIN_TSV_PATH = conf.data_dir/'split_train_test'/'Posts-train.tsv'
  71. INPUT_TEST_TSV_PATH = conf.data_dir/'split_train_test'/'Posts-test.tsv'
  72. dvc_stage_name = __file__.strip('.py')
  73. STAGE_OUTPUT_PATH = conf.data_dir/dvc_stage_name
  74. conf.remote_mkdir(STAGE_OUTPUT_PATH).compute()
  75. OUTPUT_TRAIN_MATRIX_PATH = STAGE_OUTPUT_PATH/'matrix-train.p'
  76. OUTPUT_TEST_MATRIX_PATH = STAGE_OUTPUT_PATH/'matrix-test.p'
  77. config = get_params()
  78. MAX_FEATUERS = config['max_features']
  79. featurize(
  80. INPUT_TRAIN_TSV_PATH, INPUT_TEST_TSV_PATH,
  81. OUTPUT_TRAIN_MATRIX_PATH, OUTPUT_TEST_MATRIX_PATH,
  82. MAX_FEATUERS).compute()
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...