Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

read_backendDB.py 1.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
  1. import pandas as pd
  2. from pathlib import Path
  3. '''
  4. 讀取後臺資料庫檔案,並轉存
  5. '''
  6. proj_path = Path.cwd().parents[0]
  7. print(f'working path is {str(proj_path)}')
  8. # pth_rawdb = sorted(proj_path.joinpath('backend-DB/raw').glob('*.csv'))[0]
  9. pth_rawdb = proj_path.joinpath('backend-DB/raw/srda20231016.xlsx')
  10. rawdb = pd.read_excel(pth_rawdb)
  11. rawdb['作者'] = rawdb['作者'].str.replace(' ', '') # 移除全形空白
  12. key_columns = ['出版年', '原始題目', '著作性質', '出處']
  13. rawdb[key_columns] = rawdb[key_columns].fillna(method='ffill')
  14. author = rawdb.dropna(subset=['作者']).groupby(key_columns, as_index=False)[
  15. '作者'].apply(lambda x: '; '.join(x.astype(str)))
  16. dataNb = rawdb.dropna(subset=['登錄號']).groupby(key_columns, as_index=False)[
  17. '登錄號'].apply(lambda x: '; '.join(x.astype(str)))
  18. # NOTE: 有人填錯欄位了
  19. cluster = rawdb.dropna(subset=['引用資料']).groupby(key_columns, as_index=False)[
  20. '引用資料'].apply(lambda x: '; '.join(x.astype(str)))
  21. df = rawdb.dropna(subset=['前台是否顯示']).drop(['作者', '登錄號', '引用資料'], axis=1)
  22. # 合併新的作者、叢集資訊
  23. merged_data = pd.merge(df, author, on=key_columns, how='left')
  24. merged_data = pd.merge(merged_data, dataNb, on=key_columns, how='left')
  25. merged_data = pd.merge(merged_data, cluster, on=key_columns, how='left')
  26. # arrange the order of columns
  27. merged_data = merged_data[rawdb.columns].reset_index(drop=True)
  28. # duplicated
  29. duplicate_rows = merged_data[merged_data.duplicated(
  30. subset=['出版年', '原始題目', '著作性質', '出處'], keep=False)]
  31. print(f'重複資料筆數: {duplicate_rows.shape[0]}')
  32. # Export
  33. expth = proj_path.joinpath('backend-DB/SRDA-BibData10.csv')
  34. merged_data.to_csv(expth, index=False, encoding='utf8')
  35. # TODO: 作者欄位要改成dict,輸出成json
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...