Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

featurization.py 2.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
  1. import os
  2. import pickle
  3. import sys
  4. import numpy as np
  5. import pandas as pd
  6. import scipy.sparse as sparse
  7. import yaml
  8. from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
  9. params = yaml.safe_load(open("params.yaml"))["featurize"]
  10. np.set_printoptions(suppress=True)
  11. if len(sys.argv) != 3 and len(sys.argv) != 5:
  12. sys.stderr.write("Arguments error. Usage:\n")
  13. sys.stderr.write("\tpython featurization.py data-dir-path features-dir-path\n")
  14. sys.exit(1)
  15. train_input = os.path.join(sys.argv[1], "train.tsv")
  16. test_input = os.path.join(sys.argv[1], "test.tsv")
  17. train_output = os.path.join(sys.argv[2], "train.pkl")
  18. test_output = os.path.join(sys.argv[2], "test.pkl")
  19. max_features = params["max_features"]
  20. ngrams = params["ngrams"]
  21. def get_df(data):
  22. df = pd.read_csv(
  23. data,
  24. encoding="utf-8",
  25. header=None,
  26. delimiter="\t",
  27. names=["id", "label", "text"],
  28. )
  29. sys.stderr.write(f"The input data frame {data} size is {df.shape}\n")
  30. return df
  31. def save_matrix(df, matrix, output):
  32. id_matrix = sparse.csr_matrix(df.id.astype(np.int64)).T
  33. label_matrix = sparse.csr_matrix(df.label.astype(np.int64)).T
  34. result = sparse.hstack([id_matrix, label_matrix, matrix], format="csr")
  35. msg = "The output matrix {} size is {} and data type is {}\n"
  36. sys.stderr.write(msg.format(output, result.shape, result.dtype))
  37. with open(output, "wb") as fd:
  38. pickle.dump(result, fd)
  39. pass
  40. os.makedirs(sys.argv[2], exist_ok=True)
  41. # Generate train feature matrix
  42. df_train = get_df(train_input)
  43. train_words = np.array(df_train.text.str.lower().values.astype("U"))
  44. bag_of_words = CountVectorizer(
  45. stop_words="english", max_features=max_features, ngram_range=(1, ngrams)
  46. )
  47. bag_of_words.fit(train_words)
  48. train_words_binary_matrix = bag_of_words.transform(train_words)
  49. tfidf = TfidfTransformer(smooth_idf=False)
  50. tfidf.fit(train_words_binary_matrix)
  51. train_words_tfidf_matrix = tfidf.transform(train_words_binary_matrix)
  52. save_matrix(df_train, train_words_tfidf_matrix, train_output)
  53. # Generate test feature matrix
  54. df_test = get_df(test_input)
  55. test_words = np.array(df_test.text.str.lower().values.astype("U"))
  56. test_words_binary_matrix = bag_of_words.transform(test_words)
  57. test_words_tfidf_matrix = tfidf.transform(test_words_binary_matrix)
  58. save_matrix(df_test, test_words_tfidf_matrix, test_output)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...