readtxt.py

import pandas as pd
import re
from pathlib import Path

# Get the workspace path
# projPath = Path.cwd().parents[0]
projPath = Path.cwd()
print(f'Project path: {projPath}')
download_path = projPath.joinpath('crawled-data/wos')

# Merge the downloaded files
dfs = []
# Loop through all tab-delimited .txt files in the folder
for file in download_path.rglob('*.txt'):
    # Read the file into a dataframe
    df = pd.read_csv(file, sep='\t')
    # Extract the file name without the extension
    # file_name = re.search(r'.+/(.+)\.csv', str(file)).group(1)
    # df['file_name'] = file_name
    # Append the dataframe to the list of dataframes
    dfs.append(df)

# Combine all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)
# Manual-annotation placeholders: has the record been checked / excluded?
# Initialised to 0 (not '') so the numeric aggregation below sums correctly
combined_df['checked'] = 0
combined_df['removed'] = 0
print(f'Rows after merging: {combined_df.shape[0]}')

# Drop duplicates
print(
    f'Duplicate rows: {combined_df.duplicated(subset=["title", "source", "cluster"]).sum()}')
combined_df.drop_duplicates(
    subset=['title', 'source', 'cluster'], inplace=True)

# Compare against the backend database
pth = projPath.joinpath('backend-DB/SRDA-BibData.csv')
srda_db = pd.read_csv(pth, skipinitialspace=True)
compare_bol = combined_df['title'].isin(srda_db['原始題目'])
# Flag titles already registered in the backend; unmatched rows stay NaN,
# which groupby sum below simply skips
combined_df.loc[compare_bol, 'inDB'] = 1

# Export the combined data
combined_df.to_csv(projPath.joinpath(
    'crawled-data/airi2_combined.csv'), index=False)

# Summarise the crawled data per cluster
crawled_statistics = combined_df.groupby(['cluster']).agg(
    搜尋文章數目=pd.NamedAgg(column='title', aggfunc='count'),
    已確認數目=pd.NamedAgg(column='checked', aggfunc='sum'),
    已登錄後臺數目=pd.NamedAgg(column='inDB', aggfunc='sum'),
    排除數目=pd.NamedAgg(column='removed', aggfunc='sum')
).sort_values(by='搜尋文章數目', ascending=False)

metadata = pd.read_excel(projPath.joinpath('crawler_code/資料集清單+指令語法.xlsx'))
metadata = metadata[['叢集編號', '資料叢集', '簡稱']]
# Merge metadata with the crawled statistics
crawled_statistics = metadata.merge(
    crawled_statistics, left_on='叢集編號', right_on='cluster', how='left')
crawled_statistics.to_csv(projPath.joinpath(
    'crawled-data/airi2_stats.csv'), index=True)
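
One caveat worth noting: Web of Science exports are often saved as UTF-8 with a byte-order mark (or UTF-16), and a single malformed file aborts the whole merge loop as written. Below is a minimal defensive sketch of the read step, reusing the same download_path as above; the 'utf-8-sig' encoding is an assumption about the export settings, not something the script confirms.

# Defensive variant of the merge loop (sketch; encoding is an assumption)
dfs = []
for file in download_path.rglob('*.txt'):
    try:
        # 'utf-8-sig' strips a BOM if one is present; plain UTF-8 reads unchanged
        df = pd.read_csv(file, sep='\t', encoding='utf-8-sig')
    except (UnicodeDecodeError, pd.errors.ParserError) as err:
        # Skip unreadable files instead of aborting the whole merge
        print(f'Skipping {file.name}: {err}')
        continue
    dfs.append(df)

Note also that projPath = Path.cwd() means the script must be launched from the project root; the commented-out parents[0] line suggests it was at some point run from a subfolder instead.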