Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

merge_crawled_data-ncl.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
  1. from pathlib import Path
  2. import pandas as pd
  3. import csv
# Raise the csv module's per-field size cap to 1,000,000 characters so that
# very long crawled text fields do not raise "field larger than field limit".
csv.field_size_limit(1000000)
  6. def merge_txt_files_to_csv(input_folder_path, output_folder_path):
  7. output_folder = Path(output_folder_path)
  8. output_folder.mkdir(parents=True, exist_ok=True)
  9. for folder_path in Path(input_folder_path).iterdir():
  10. print(f'folder_path is {folder_path}')
  11. if folder_path.is_dir():
  12. csv_file_name = folder_path.name + ".csv"
  13. csv_file_path = output_folder / csv_file_name
  14. dfs = []
  15. for f in folder_path.glob("*.csv"):
  16. try:
  17. # df = pd.read_csv(f, sep=',', engine='python',
  18. # on_bad_lines='warn' # 先忽略格式問題
  19. # )
  20. df = pd.read_csv(f, header=0, sep=',',
  21. quotechar='"',
  22. skipinitialspace=True,
  23. # quoting=csv.QUOTE_NONE,
  24. encoding='utf-8', engine='python',
  25. dtype=str,
  26. on_bad_lines='warn' # 先忽略格式問題
  27. )
  28. dfs.append(df)
  29. except ValueError as e:
  30. print(f"Error reading file {f}: {e}")
  31. if len(dfs) > 0:
  32. combined_df = pd.concat(dfs, ignore_index=True)
  33. combined_df.to_csv(csv_file_path, index=False)
  34. print("Done!")
  35. this_file_path = Path(__file__)
  36. input_folder_path = this_file_path.parents[1].joinpath(f"crawler_data")
  37. output_folder_path = this_file_path.parents[1].joinpath(
  38. f"crawler_data/out/NCL")
  39. merge_txt_files_to_csv(input_folder_path, output_folder_path)
Tip!

Press p (or the left arrow key) to see the previous file, or n (or the right arrow key) to see the next file

Comments

Loading...