Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

merge_crawled_data-wos.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
  1. from pathlib import Path
  2. import pandas as pd
  3. import csv
  4. # 設定欄位長度上限為100萬個字元
  5. csv.field_size_limit(1000000)
  6. # 設定相對路徑
  7. dbwebsite = 'WOS'
  8. home_path = Path(__file__).parents[1]
  9. def merge_txt_files_to_csv(input_folder_path):
  10. """
  11. 將指定資料夾中的所有txt檔案合併成一個CSV檔案
  12. Args:
  13. input_folder_path (str): 包含txt檔案的資料夾路徑
  14. Returns:
  15. None
  16. """
  17. # 建立輸出CSV檔案的資料夾
  18. export_path = home_path.joinpath(f'crawled_data/export/{dbwebsite}')
  19. export_path.mkdir(parents=True, exist_ok=True)
  20. # 迴圈輸入資料夾中的所有子資料夾
  21. for folder_path in Path(input_folder_path).iterdir():
  22. if folder_path.is_dir():
  23. dbnm = folder_path.name # 資料集名稱
  24. dfs = []
  25. for f in folder_path.glob("*.txt"):
  26. try:
  27. df = pd.read_csv(f, sep='\t', engine='python', quoting=3)
  28. dfs.append(df)
  29. except ValueError as e:
  30. print(f"Error reading file {f}: {e}")
  31. # 有讀取到檔案才進行合併
  32. if len(dfs) > 0:
  33. # 將所有DataFrame合併
  34. combined_df = pd.concat(dfs, ignore_index=True)
  35. # 將合併後的DataFrame寫入CSV檔案
  36. combined_df.to_csv(export_path.joinpath(
  37. f'{dbnm}.csv'), index=False)
  38. print(f'檔案已合併完畢: 路徑為{export_path}')
  39. else:
  40. print(f'{dbnm}沒有抓到檔案')
  41. if __name__ == "__main__":
  42. # 設定輸入資料夾的相對路徑
  43. input_folder_path = home_path.joinpath(
  44. f'crawled_data/temp/raw/{dbwebsite}')
  45. # 呼叫合併函數
  46. merge_txt_files_to_csv(input_folder_path)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...