Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

zinc_utils.py 2.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
  1. from pathlib import Path
  2. import numpy as np
  3. import pandas as pd
  4. def number_of_mols(data_path):
  5. path = Path(data_path)
  6. idx_file_mapping = []
  7. if path.is_dir():
  8. num_lines = 0
  9. for f in path.iterdir():
  10. text = f.read_text()
  11. num_mols = len(text.split("\n")) - 1
  12. idx_file_mapping.append((num_lines, num_lines + num_mols, f))
  13. num_lines += num_mols
  14. else:
  15. text = path.read_text()
  16. num_lines = len(text.split("\n"))
  17. idx_file_mapping.append((0, num_lines, path))
  18. return num_lines, idx_file_mapping
  19. def read_df_slice(idxs, idx_file_mapping):
  20. """Read a slice of the dataset from disk by looking up the required files in the mapping
  21. Args:
  22. idxs (List[int]): Contiguous list of indices into the full dataset of molecules to read
  23. idx_file_mapping (dict): Mapping returned by number_of_mols function
  24. Returns:
  25. (pd.DataFrame): DataFrame of lines from dataset
  26. """
  27. file_idx_map = {}
  28. curr_idx = 0
  29. for start, end, file_path in idx_file_mapping:
  30. while curr_idx < len(idxs) and start <= idxs[curr_idx] < end:
  31. file_idx_map.setdefault(str(file_path), [])
  32. file_idx_map[str(file_path)].append(idxs[curr_idx] - start)
  33. curr_idx += 1
  34. dfs = []
  35. for file_path, file_idxs in file_idx_map.items():
  36. file_df = pd.read_csv(Path(file_path))
  37. df = file_df.iloc[file_idxs]
  38. dfs.append(df)
  39. df_slice = pd.concat(dfs, ignore_index=True, copy=False)
  40. return df_slice
  41. def read_zinc_slice(data_path, rank, num_gpus, batch_size):
  42. num_mols, idx_file_mapping = number_of_mols(data_path)
  43. rank_idxs = [idxs.tolist() for idxs in np.array_split(list(range(num_mols)), num_gpus)]
  44. # Drop last mols to ensure all processes have the same number of batches
  45. num_mols = min([len(idxs) for idxs in rank_idxs])
  46. num_mols = batch_size * (num_mols // batch_size)
  47. idxs = rank_idxs[rank][:num_mols]
  48. df_slice = read_df_slice(idxs, idx_file_mapping)
  49. print(f"Read {str(len(df_slice.index))} molecules for gpu {str(rank)}")
  50. # How this df is utilised needs to be determined
  51. # dataset = ZincSlice(df_slice)
  52. return dataset
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...