Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

stage_03_featurization.py 2.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
  1. import argparse
  2. import os
  3. import logging
  4. from src.utils.common import read_yaml, create_directories ,get_df
  5. from src.utils.featurize import save_matrix
  6. import numpy as np
  7. from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
  # Label interpolated into the start/completion log messages of this stage.
  STAGE = "Stage 03 featurization"

  # The file handler created by logging.basicConfig cannot open its log file
  # when the parent directory is missing, so make sure the directory exists.
  LOG_DIR = "logs"
  os.makedirs(LOG_DIR, exist_ok=True)

  # Append INFO-and-above records to logs/running_logs.log.
  logging.basicConfig(
      filename=os.path.join(LOG_DIR, "running_logs.log"),
      level=logging.INFO,
      format="[%(asctime)s: %(levelname)s: %(module)s]: %(message)s",
      filemode="a",
  )
  def _lowercased_text(df):
      """Return the ``text`` column of ``df`` lower-cased, as a NumPy unicode array."""
      return np.array(df.text.str.lower().values.astype("U"))


  def main(config_path, params_path):
      """Turn the prepared train/test data into TF-IDF feature matrices.

      A ``CountVectorizer`` and a ``TfidfTransformer`` are fitted on the
      training text only; the test text is transformed with those fitted
      objects. Each resulting matrix is handed to ``save_matrix`` together
      with the frame it was computed from.

      Args:
          config_path: Path passed to ``read_yaml``. The loaded config must
              have an ``artifacts`` section with the keys ``ARTIFACTS_DIR``,
              ``PREPARED_DATA``, ``TRAIN_DATA``, ``TEST_DATA``,
              ``FEATURIZED_DATA``, ``FEATURIZED_TRAIN_DATA`` and
              ``FEATURIZED_TEST_DATA``.
          params_path: Path passed to ``read_yaml``. The loaded params must
              have ``featurize.max_features`` and ``featurize.ngrams``.
      """
      ## read config files
      config = read_yaml(config_path)
      params = read_yaml(params_path)

      artifacts = config["artifacts"]
      artifacts_dir = artifacts["ARTIFACTS_DIR"]

      # Inputs: the prepared train/test files.
      prepared_dir = os.path.join(artifacts_dir, artifacts["PREPARED_DATA"])
      train_data_path = os.path.join(prepared_dir, artifacts["TRAIN_DATA"])
      test_data_path = os.path.join(prepared_dir, artifacts["TEST_DATA"])

      # Outputs: the featurized train/test files.
      featurized_dir = os.path.join(artifacts_dir, artifacts["FEATURIZED_DATA"])
      create_directories([featurized_dir])
      featurized_train_data_path = os.path.join(
          featurized_dir, artifacts["FEATURIZED_TRAIN_DATA"]
      )
      featurized_test_data_path = os.path.join(
          featurized_dir, artifacts["FEATURIZED_TEST_DATA"]
      )

      featurize_params = params["featurize"]
      bag_of_words = CountVectorizer(
          stop_words="english",
          max_features=featurize_params["max_features"],
          ngram_range=(1, featurize_params["ngrams"]),
      )
      tfidf = TfidfTransformer(smooth_idf=False)

      # Train split: fit_transform learns the vocabulary / IDF weights and
      # produces the matrix in one pass instead of tokenizing the text twice.
      df_train = get_df(train_data_path)
      train_counts = bag_of_words.fit_transform(_lowercased_text(df_train))
      train_tfidf = tfidf.fit_transform(train_counts)
      save_matrix(df_train, train_tfidf, featurized_train_data_path)

      # Test split: only transform, reusing what was learned from the train split.
      df_test = get_df(test_data_path)
      test_counts = bag_of_words.transform(_lowercased_text(df_test))
      test_tfidf = tfidf.transform(test_counts)
      save_matrix(df_test, test_tfidf, featurized_test_data_path)
  if __name__ == '__main__':
      # Command-line entry point; the options default to
      # configs/config.yaml and params.yaml.
      parser = argparse.ArgumentParser()
      parser.add_argument("--config", "-c", default="configs/config.yaml")
      parser.add_argument("--params", "-p", default="params.yaml")
      parsed_args = parser.parse_args()

      try:
          logging.info("\n********************")
          logging.info(f">>>>> stage {STAGE} started <<<<<")
          main(config_path=parsed_args.config, params_path=parsed_args.params)
          logging.info(f">>>>> stage {STAGE} completed!<<<<<\n")
      except Exception as e:
          # Record the failure with its traceback in the log file, then
          # re-raise the active exception unchanged so the script still fails.
          logging.exception(e)
          raise
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...