Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

prepare.py 1.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
  1. import io
  2. import os
  3. import random
  4. import re
  5. import sys
  6. import xml.etree.ElementTree
  7. import yaml
  8. params = yaml.safe_load(open("params.yaml"))["prepare"]
  9. if len(sys.argv) != 2:
  10. sys.stderr.write("Arguments error. Usage:\n")
  11. sys.stderr.write("\tpython prepare.py data-file\n")
  12. sys.exit(1)
  13. # Test data set split ratio
  14. split = params["split"]
  15. random.seed(params["seed"])
  16. input = sys.argv[1]
  17. output_train = os.path.join("data", "prepared", "train.tsv")
  18. output_test = os.path.join("data", "prepared", "test.tsv")
  19. def process_posts(fd_in, fd_out_train, fd_out_test, target_tag):
  20. num = 1
  21. for line in fd_in:
  22. try:
  23. fd_out = fd_out_train if random.random() > split else fd_out_test
  24. attr = xml.etree.ElementTree.fromstring(line).attrib
  25. pid = attr.get("Id", "")
  26. label = 1 if target_tag in attr.get("Tags", "") else 0
  27. title = re.sub(r"\s+", " ", attr.get("Title", "")).strip()
  28. body = re.sub(r"\s+", " ", attr.get("Body", "")).strip()
  29. text = title + " " + body
  30. fd_out.write("{}\t{}\t{}\n".format(pid, label, text))
  31. num += 1
  32. except Exception as ex:
  33. sys.stderr.write(f"Skipping the broken line {num}: {ex}\n")
  34. os.makedirs(os.path.join("data", "prepared"), exist_ok=True)
  35. with io.open(input, encoding="utf8") as fd_in:
  36. with io.open(output_train, "w", encoding="utf8") as fd_out_train:
  37. with io.open(output_test, "w", encoding="utf8") as fd_out_test:
  38. process_posts(fd_in, fd_out_train, fd_out_test, "<python>")
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...