preprocess.py

import os

import gcsfs
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (roc_auc_score, average_precision_score,
                             accuracy_score, precision_score, recall_score,
                             f1_score)

PROJECT_NAME = 'talos-project'
GCLOUD_CRED_ENV_VAR = 'GOOGLE_APPLICATION_CREDENTIALS'
CHUNK_SIZE = 5000
TARGET_LABEL = 'is_top_decile'

raw_df_path = 'rML-raw-data.csv'
train_df_path = 'rML-train.csv'
test_df_path = 'rML-test.csv'


def get_remote_gs_wfs():
    """Parse the GCS bucket URL out of `dvc remote list --local`."""
    print('Retrieving location of remote working file system...')
    stream = os.popen('dvc remote list --local')
    output = stream.read()
    remote_wfs_loc = output.split('\t')[1].split('\n')[0]
    return remote_wfs_loc


def load_and_process_data(remote_wfs, random_state=42):
    fs = gcsfs.GCSFileSystem(project=PROJECT_NAME, token=os.environ[GCLOUD_CRED_ENV_VAR])
    with fs.open(os.path.join(remote_wfs, train_df_path), 'a') as train_f, \
            fs.open(os.path.join(remote_wfs, test_df_path), 'a') as test_f:
        print('Loading data in chunks...')
        # Stream the raw CSV in chunks so the full dataset never has to fit in memory
        for i, chunk in enumerate(pd.read_csv(os.path.join(remote_wfs, raw_df_path), chunksize=CHUNK_SIZE)):
            print(f'Processing chunk {i + 1}...')
            processed_data = process(chunk)
            print('Splitting into train and test data...')
            train_chunk, test_chunk = train_test_split(processed_data,
                                                       random_state=random_state,
                                                       stratify=processed_data[TARGET_LABEL])
            print('Saving to cloud...')
            save_data(train_chunk, train_f, test_chunk, test_f, i)


def process(chunk):
    df = chunk.copy()
    df = df.drop(columns=['id', 'author'])
    df = df.rename(columns={'selftext': 'body', 'link_flair_text': 'flair'})

    # Simple length and thumbnail features
    df['title_len'] = df.title.str.len()
    df['body_len'] = df.body.str.len()
    df['has_thumbnail'] = [0 if (x == 'self' or x == 'default') else 1 for x in df['thumbnail']]

    df = df.fillna({'body': '', 'flair': 'None', 'body_len': 0})
    # Fix a known misspelling of the 'Discussion' flair in the raw data
    df['flair'] = ['Discussion' if (x == 'Discusssion') else x for x in df['flair']]
    # One-hot encode the flair column
    df = pd.concat([df, pd.get_dummies(df['flair'], prefix='flair')], axis=1).drop(['flair'], axis=1)
    df['title_and_body'] = df['title'] + ' ' + df['body']
    return df


def save_data(train_chunk, train_f, test_chunk, test_f, i):
    # We want to write the headers only once
    header = i == 0
    train_chunk.to_csv(train_f, header=header, mode='a')
    test_chunk.to_csv(test_f, header=header, mode='a')


if __name__ == '__main__':
    remote_wfs = get_remote_gs_wfs()
    load_and_process_data(remote_wfs)
    print('Loading and processing done!')
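
After the script finishes, the processed splits live in the same GCS bucket that DVC reports as the remote. A minimal sanity-check sketch for reading one back, assuming the same project and credentials setup as above (the bucket path here is a placeholder for whatever `get_remote_gs_wfs()` returns):

    import os
    import gcsfs
    import pandas as pd

    fs = gcsfs.GCSFileSystem(project='talos-project',
                             token=os.environ['GOOGLE_APPLICATION_CREDENTIALS'])
    # Replace <remote_wfs> with the bucket URL from `dvc remote list --local`
    with fs.open('<remote_wfs>/rML-train.csv') as f:
        train_df = pd.read_csv(f, index_col=0)
    print(train_df.columns)

Because `save_data` appends each chunk with the header written only on the first one, the file reads back as a single contiguous CSV.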