readtxt.py

import pandas as pd
import re
from pathlib import Path

# Get the workspace path
# projPath = Path.cwd().parents[0]
projPath = Path.cwd()
print(f'Project path: {projPath}')
download_path = projPath.joinpath('crawled-data/wos')

# Merge the downloaded files
dfs = []
# Loop through all tab-delimited .txt files in the folder
for file in download_path.rglob('*.txt'):
    # Read the file into a dataframe
    df = pd.read_csv(file, sep='\t')
    # Extract the file name without the extension
    # file_name = re.search(r'.+/(.+)\.csv', str(file)).group(1)
    # df['file_name'] = file_name
    # Append the dataframe to the list of dataframes
    dfs.append(df)

# Combine all dataframes into a single dataframe
combined_df = pd.concat(dfs, ignore_index=True)
# Manual-annotation placeholders: has the record been checked / excluded?
# Initialised to 0 (not '') so the numeric aggregation below sums correctly
combined_df['checked'] = 0
combined_df['removed'] = 0
print(f'Rows after merging: {combined_df.shape[0]}')

# Drop duplicates
print(
    f'Duplicate rows: {combined_df.duplicated(subset=["title", "source", "cluster"]).sum()}')
combined_df.drop_duplicates(
    subset=['title', 'source', 'cluster'], inplace=True)

# Compare against the backend database
pth = projPath.joinpath('backend-DB/SRDA-BibData.csv')
srda_db = pd.read_csv(pth, skipinitialspace=True)
compare_bol = combined_df['title'].isin(srda_db['原始題目'])
# Flag titles already registered in the backend; unmatched rows stay NaN,
# which groupby sum below simply skips
combined_df.loc[compare_bol, 'inDB'] = 1

# Export the combined data
combined_df.to_csv(projPath.joinpath(
    'crawled-data/airi2_combined.csv'), index=False)

# Summarise the crawled data per cluster
crawled_statistics = combined_df.groupby(['cluster']).agg(
    搜尋文章數目=pd.NamedAgg(column='title', aggfunc='count'),
    已確認數目=pd.NamedAgg(column='checked', aggfunc='sum'),
    已登錄後臺數目=pd.NamedAgg(column='inDB', aggfunc='sum'),
    排除數目=pd.NamedAgg(column='removed', aggfunc='sum')
).sort_values(by='搜尋文章數目', ascending=False)

metadata = pd.read_excel(projPath.joinpath('crawler_code/資料集清單+指令語法.xlsx'))
metadata = metadata[['叢集編號', '資料叢集', '簡稱']]
# Merge metadata with the crawled statistics
crawled_statistics = metadata.merge(
    crawled_statistics, left_on='叢集編號', right_on='cluster', how='left')
crawled_statistics.to_csv(projPath.joinpath(
    'crawled-data/airi2_stats.csv'), index=True)
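
One caveat worth noting: Web of Science exports are often saved as UTF-8 with a byte-order mark (or UTF-16), and a single malformed file aborts the whole merge loop as written. Below is a minimal defensive sketch of the read step, reusing the same download_path as above; the 'utf-8-sig' encoding is an assumption about the export settings, not something the script confirms.

# Defensive variant of the merge loop (sketch; encoding is an assumption)
dfs = []
for file in download_path.rglob('*.txt'):
    try:
        # 'utf-8-sig' strips a BOM if one is present; plain UTF-8 reads unchanged
        df = pd.read_csv(file, sep='\t', encoding='utf-8-sig')
    except (UnicodeDecodeError, pd.errors.ParserError) as err:
        # Skip unreadable files instead of aborting the whole merge
        print(f'Skipping {file.name}: {err}')
        continue
    dfs.append(df)

Note also that projPath = Path.cwd() means the script must be launched from the project root; the commented-out parents[0] line suggests it was at some point run from a subfolder instead.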