Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

xml_to_tsv.py 1.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
  1. import sys
  2. import os
  3. import xml.etree.ElementTree
  4. import conf
  5. import dask
  6. import dask.distributed
  7. client = dask.distributed.Client('localhost:8786')
  8. INPUT = conf.source_xml
  9. OUTPUT = conf.source_tsv
  10. @dask.delayed
  11. def workflow(input_path, output_path):
  12. def print_usage(msg):
  13. if msg:
  14. sys.stderr.write('{}\n'.format(msg))
  15. sys.stderr.write('Usage:\n')
  16. sys.stderr.write('\tpython posts_to_tsv.py\n')
  17. def process_posts(fd_in, fd_out, target_tag):
  18. num = 1
  19. for line in fd_in:
  20. try:
  21. attr = xml.etree.ElementTree.fromstring(line).attrib
  22. id = attr.get('Id', '')
  23. label = 1 if target_tag in attr.get('Tags', '') else 0
  24. title = attr.get('Title', '').replace('\t', ' ').replace(
  25. '\n', ' ').replace('\r', ' ')
  26. body = attr.get('Body', '').replace('\t', ' ').replace(
  27. '\n', ' ').replace('\r', ' ')
  28. text = title + ' ' + body
  29. fd_out.write(u'{}\t{}\t{}\n'.format(id, label, text))
  30. num += 1
  31. except Exception as ex:
  32. sys.stderr.write('Error in line {}: {}\n'.format(num, ex))
  33. TAG = 'python'
  34. target_tag = u'<' + TAG + '>'
  35. if not os.path.exists(input_path):
  36. print_usage('Input file {} does not exist'.format(input_path))
  37. sys.exit(1)
  38. with open(input_path) as fd_in:
  39. with open(output_path, 'w') as fd_out:
  40. process_posts(fd_in, fd_out, target_tag)
  41. if __name__ == '__main__':
  42. workflow(INPUT, OUTPUT).compute()
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...