Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocess_utilities.py 1.6 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
  1. import pandas as pd
  2. import os
  3. from os import listdir
  4. from os.path import join, splitext
  5. import sys
  6. def convert_tsv_to_pandas(file_name, path_raw):
  7. complete_path = join(path_raw, file_name)
  8. df = pd.read_csv(complete_path ,sep='\t')
  9. df.columns = ["label", "text"]
  10. return df
  11. def combine_all_files(path_raw, files_ext = '.tsv'):
  12. all_files = listdir(path_raw)
  13. list_all_dataframes = []
  14. for file_name in all_files:
  15. # Check the extention of the file
  16. if(file_name.endswith(files_ext)):
  17. # Create pandas dataframe
  18. current_df = convert_tsv_to_pandas(file_name, path_raw)
  19. # Get the name of the file without the extension
  20. language = splitext(file_name)[0]
  21. current_df['language'] = str(language)
  22. list_all_dataframes.append(current_df)
  23. # Concate all the dataframes
  24. final_df = pd.concat(list_all_dataframes)
  25. return final_df
  26. def data_preprocessor(raw_path, preprocessed_path,
  27. processed_file_name,
  28. files_ext = '.tsv', final_ext = '.csv'):
  29. # Create a single pandas dataframe for all the files
  30. combined_pandas = combine_all_files(raw_path, files_ext)
  31. # Convert 0 1 to negative and positive and change the label column
  32. labels = {0: 'negative', 1: 'positive'}
  33. combined_pandas['label'] = combined_pandas['label'].map(labels)
  34. # Save the final file to destination
  35. complete_preprocessed_path = join(preprocessed_path, processed_file_name+final_ext)
  36. combined_pandas.to_csv(complete_preprocessed_path, encoding='utf-8', index=False)
  37. print("Preprocessing Complete!")
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...