Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

stage_01_prepare.py 2.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
  1. import argparse
  2. import os
  3. import shutil
  4. from tqdm import tqdm
  5. import logging
  6. from src.utils.all_utils import read_yaml, create_directory
  7. from src.utils.data_management import process_posts
  8. import random
  9. logging_str = "[%(asctime)s: %(levelname)s: %(module)s]: %(message)s"
  10. log_dir = "logs"
  11. os.makedirs(log_dir, exist_ok=True)
  12. logging.basicConfig(filename=os.path.join(log_dir, 'running_logs.log'), level=logging.INFO, format=logging_str,
  13. filemode="a")
  14. def main(config_path, params_path):
  15. """
  16. Main function for stage_01_prepare.py to convert xml to tsv
  17. :param config_path: path to config file
  18. :config_path type: str
  19. """
  20. config = read_yaml(config_path)
  21. params = read_yaml(params_path)
  22. source_data = config['source_data']
  23. input_data = os.path.join(source_data['data_dir'], source_data['data_file'])
  24. split = params['prepare']['split']
  25. seed = params['prepare']['seed']
  26. random.seed(seed)
  27. artifacts = config['artifacts']
  28. prepare_data_dir_path = os.path.join(artifacts['ARTIFACTS_DIR'], artifacts['PREPARED_DATA'])
  29. create_directory([prepare_data_dir_path])
  30. train_data_path = os.path.join(prepare_data_dir_path, artifacts['TRAIN_DATA'])
  31. test_data_path = os.path.join(prepare_data_dir_path, artifacts['TEST_DATA'])
  32. with open(input_data, encoding="utf8") as fd_in:
  33. with open(train_data_path, "w", encoding="utf8") as fd_out_train:
  34. with open(test_data_path, "w",encoding="utf8") as fd_out_test:
  35. process_posts(fd_in, fd_out_train, fd_out_test, "<python>",split)
  36. if __name__ == '__main__':
  37. args = argparse.ArgumentParser()
  38. args.add_argument("--config", "-c", default="config/config.yaml")
  39. args.add_argument("--params", "-p", default="params.yaml")
  40. parsed_args = args.parse_args()
  41. try:
  42. logging.info("\n********************")
  43. logging.info(">>>>> stage one started <<<<<")
  44. main(config_path=parsed_args.config, params_path=parsed_args.params)
  45. logging.info(">>>>> stage one completed! all the data are saved in local <<<<<n")
  46. except Exception as e:
  47. logging.exception(e)
  48. raise e
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...