Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

mol_data.py 1.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
  1. """ Module containing classes for loading molecular data"""
  2. from pathlib import Path
  3. from typing import Any, Dict, List, Tuple
  4. import pandas as pd
  5. import torch
  6. from rdkit import Chem
  7. from molbart.data.base import MoleculeListDataModule
  8. class ChemblDataModule(MoleculeListDataModule):
  9. """
  10. DataModule for Chembl dataset.
  11. The molecules and the lengths of the sequences
  12. are loaded from a pickled DataFrame
  13. """
  14. def _load_all_data(self) -> None:
  15. df = pd.read_pickle(self.dataset_path)
  16. self._all_data = {
  17. "molecules": df["molecules"].tolist(),
  18. "lengths": df["lengths"].tolist(),
  19. }
  20. self._set_split_indices_from_dataframe(df)
  21. def _transform_batch(
  22. self, batch: List[Dict[str, Any]], train: bool
  23. ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, List[str]]:
  24. smiles_batch = [{"smiles": Chem.MolToSmiles(item["molecules"])} for item in batch]
  25. return super()._transform_batch(smiles_batch, train)
  26. class ZincDataModule(MoleculeListDataModule):
  27. """
  28. DataModule for Zinc dataset.
  29. The molecules are read as SMILES from a number of
  30. csv files.
  31. """
  32. def _load_all_data(self) -> None:
  33. path = Path(self.dataset_path)
  34. if path.is_dir():
  35. dfs = [pd.read_csv(filename) for filename in path.iterdir()]
  36. df = pd.concat(dfs, ignore_index=True, copy=False)
  37. else:
  38. df = pd.read_csv(path)
  39. self._all_data = {"smiles": df["smiles"].tolist()}
  40. self._set_split_indices_from_dataframe(df)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...