Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

merge_crawled_data-TCI.py 3.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
  1. """
  2. 整理TCI爬蟲結果檔
  3. """
  4. from pathlib import Path
  5. import datetime
  6. import pandas as pd
  7. # 設定相對路徑
  8. dbwebsite = "TCI_20240507"
  9. home_path = Path(__file__).parents[1]
  10. def process_folder(input_folder_path, data_type):
  11. """
  12. 合併各計畫爬蟲結果檔文件夾中的所有csv檔。
  13. 讀取子文件夾中的所有 CSV 文件,
  14. 將它們連接成一個單一的 DataFrame,並將 DataFrame 保存為
  15. 輸出文件夾中名稱與子文件夾相同的 CSV 文件。
  16. 參數:
  17. input_folder_path (str 或 Path): 包含子文件夾的輸入文件夾路徑。
  18. data_type (str): "bibData" 或是 "citation"
  19. """
  20. for folder_path in Path(input_folder_path).iterdir():
  21. # print(f'folder_path is {folder_path}')
  22. dbnm = folder_path.name
  23. dfs = []
  24. for file_item in folder_path.joinpath(f"{data_type}").glob("*.csv"):
  25. df = pd.read_csv(file_item, engine="python")
  26. dfs.append(df)
  27. if dfs:
  28. combined_df = pd.concat(dfs, ignore_index=True)
  29. output_folder_path = home_path.joinpath(
  30. f"crawled_data/temp/{dbwebsite}/combined/{dbnm}"
  31. )
  32. output_folder_path.mkdir(parents=True, exist_ok=True)
  33. combined_df.to_csv(
  34. output_folder_path.joinpath(f"{data_type}.csv"), index=False
  35. )
  36. def merge_and_export_data(output_folder_path):
  37. """
  38. 合併個計畫資料夾中的引用和書目資料,並將合併後的資料匯出為CSV檔案。
  39. Args:
  40. output_folder_path (str): 待合併的書目和引用資料檔的路徑位置。
  41. Returns:
  42. None
  43. """
  44. error_df = pd.DataFrame()
  45. for folder_path in Path(output_folder_path).iterdir():
  46. # Iterate over subfolders in the parent folder
  47. dbnm = folder_path.name # Get the name of the subfolder
  48. cit = pd.read_csv(f"{output_folder_path}/{dbnm}/citation.csv")
  49. try:
  50. cit.drop_duplicates(subset=["題名"], inplace=True)
  51. except Exception as e:
  52. print(f"{dbnm}: {e}")
  53. errdf = pd.DataFrame({"叢集編號": [dbnm]})
  54. error_df = pd.concat([error_df, errdf], axis=0)
  55. continue
  56. bib = pd.read_csv(f"{output_folder_path}/{dbnm}/bibData.csv")
  57. bib.drop_duplicates(subset=["題名"], inplace=True)
  58. df_m = bib.merge(cit, how="outer", on="題名", indicator=True)
  59. export_path = home_path.joinpath(f"crawled_data/export/{dbwebsite}")
  60. export_path.mkdir(parents=True, exist_ok=True)
  61. df_m.to_csv(export_path.joinpath(f"{dbnm}.csv"))
  62. formatted_date = datetime.date.today().strftime("%Y%m%d")
  63. error_df.to_csv(
  64. home_path.joinpath(f"資料有誤_{formatted_date}.csv"), index=False
  65. )
  66. if __name__ == "__main__":
  67. # 設定相對路徑
  68. input_folder_path = home_path.joinpath(f"crawled_data/temp/raw/{dbwebsite}")
  69. output_folder_path = home_path.joinpath(f"crawled_data/temp/{dbwebsite}/combined")
  70. for dbtype in ["bibData", "citation"]:
  71. process_folder(input_folder_path, dbtype)
  72. merge_and_export_data(output_folder_path)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...