merge_crawled_data.py 992 B

"""
Merge the CSV files in the crawled_data folder into a single export file.
"""
from pathlib import Path
import re

import pandas as pd

# Project root: one directory above this script.
home_path = Path(__file__).parents[1]


def merge_all_data(folder_path, website):
    dfs = []
    for file_path in Path(folder_path).iterdir():
        clsnm = file_path.name  # Cluster name, taken from the file name
        df = pd.read_csv(file_path)
        df["cluster"] = re.sub(r"\.csv$", "", clsnm)
        dfs.append(df)
    # Concatenate all DataFrames into one
    combined_df = pd.concat(dfs, ignore_index=True)
    export_path = home_path.joinpath("crawled_data/export/ALL")
    export_path.mkdir(parents=True, exist_ok=True)
    combined_df.to_csv(export_path.joinpath(f"{website}.csv"), index=False)


if __name__ == "__main__":
    # Relative path of the input folder
    website = "TCI_20240507"
    folder_path = home_path.joinpath(f"crawled_data/export/{website}")
    # Call the merge function
    merge_all_data(folder_path, website)