Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

merge_crawled_data-TCI.py 3.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
  1. """
  2. 整理TCI爬蟲結果檔
  3. """
  4. from pathlib import Path
  5. import datetime
  6. import pandas as pd
  7. # 設定相對路徑
  8. dbwebsite = "TCI_20240507"
  9. home_path = Path(__file__).parents[1]
  10. def process_folder(input_folder_path, data_type):
  11. """
  12. 合併各計畫爬蟲結果檔文件夾中的所有csv檔。
  13. 讀取子文件夾中的所有 CSV 文件,
  14. 將它們連接成一個單一的 DataFrame,並將 DataFrame 保存為
  15. 輸出文件夾中名稱與子文件夾相同的 CSV 文件。
  16. 參數:
  17. input_folder_path (str 或 Path): 包含子文件夾的輸入文件夾路徑。
  18. data_type (str): "bibData" 或是 "citation"
  19. """
  20. for folder_path in Path(input_folder_path).iterdir():
  21. # print(f'folder_path is {folder_path}')
  22. dbnm = folder_path.name
  23. dfs = []
  24. for file_item in folder_path.joinpath(f"{data_type}").glob("*.csv"):
  25. df = pd.read_csv(file_item, engine="python")
  26. dfs.append(df)
  27. if dfs:
  28. combined_df = pd.concat(dfs, ignore_index=True)
  29. output_folder_path = home_path.joinpath(
  30. f"crawled_data/temp/{dbwebsite}/combined/{dbnm}"
  31. )
  32. output_folder_path.mkdir(parents=True, exist_ok=True)
  33. combined_df.to_csv(
  34. output_folder_path.joinpath(f"{data_type}.csv"), index=False
  35. )
  36. def merge_and_export_data(output_folder_path):
  37. """
  38. 合併個計畫資料夾中的引用和書目資料,並將合併後的資料匯出為CSV檔案。
  39. Args:
  40. output_folder_path (str): 待合併的書目和引用資料檔的路徑位置。
  41. Returns:
  42. None
  43. """
  44. error_df = pd.DataFrame()
  45. for folder_path in Path(output_folder_path).iterdir():
  46. # Iterate over subfolders in the parent folder
  47. dbnm = folder_path.name # Get the name of the subfolder
  48. cit = pd.read_csv(f"{output_folder_path}/{dbnm}/citation.csv")
  49. try:
  50. cit.drop_duplicates(subset=["題名"], inplace=True)
  51. except Exception as e:
  52. print(f"{dbnm}: {e}")
  53. errdf = pd.DataFrame({"叢集編號": [dbnm]})
  54. error_df = pd.concat([error_df, errdf], axis=0)
  55. continue
  56. bib = pd.read_csv(f"{output_folder_path}/{dbnm}/bibData.csv")
  57. bib.drop_duplicates(subset=["題名"], inplace=True)
  58. df_m = bib.merge(cit, how="outer", on="題名", indicator=True)
  59. export_path = home_path.joinpath(f"crawled_data/export/{dbwebsite}")
  60. export_path.mkdir(parents=True, exist_ok=True)
  61. df_m.to_csv(export_path.joinpath(f"{dbnm}.csv"))
  62. formatted_date = datetime.date.today().strftime("%Y%m%d")
  63. error_df.to_csv(
  64. home_path.joinpath(f"資料有誤_{formatted_date}.csv"), index=False
  65. )
  66. if __name__ == "__main__":
  67. # 設定相對路徑
  68. input_folder_path = home_path.joinpath(f"crawled_data/temp/raw/{dbwebsite}")
  69. output_folder_path = home_path.joinpath(f"crawled_data/temp/{dbwebsite}/combined")
  70. for dbtype in ["bibData", "citation"]:
  71. process_folder(input_folder_path, dbtype)
  72. merge_and_export_data(output_folder_path)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...