Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

smiles_utils.py 2.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
  1. from typing import List, Tuple
  2. import numpy as np
  3. import pandas as pd
  4. from rdkit import Chem
  5. def canonicalize_smiles(smiles: str) -> str:
  6. """
  7. Canonicalize smiles and sort the (possible) multiple molcules.
  8. Args:
  9. smiles: Input SMILES string.
  10. Returns:
  11. Canonicalized SMILES string.
  12. """
  13. mol = Chem.MolFromSmiles(smiles)
  14. if not mol:
  15. return smiles
  16. smiles_canonical = Chem.MolToSmiles(mol)
  17. smiles_canonical = ".".join(sorted(smiles_canonical.split(".")))
  18. return smiles_canonical
  19. def inchi_key(smiles: str):
  20. """
  21. Get inchi key of input SMILES.
  22. Args:
  23. smiles: Input SMILES string
  24. Returns:
  25. Inchi-key of SMILES string or SMILES string if invalid rdkit molecule.
  26. """
  27. mol = Chem.MolFromSmiles(smiles)
  28. if not mol:
  29. return smiles
  30. return Chem.MolToInchiKey(mol)
  31. def uniqueify_sampled_smiles(
  32. sampled_smiles: List[np.ndarray],
  33. log_lhs: List[np.ndarray],
  34. n_unique_beams: int,
  35. ) -> Tuple[List[np.ndarray], List[np.ndarray]]:
  36. """
  37. Get unique SMILES and corresponding highest log-likelihood of each input.
  38. For beam_size > 1: Uniqueifying sampled molecules and select
  39. 'n_unique_beams'-top molecules.
  40. Args:
  41. sampled_smiles: list of top-k sampled SMILES
  42. log_lhs: list of top-k log-likelihoods
  43. n_unique_beams: upper limit on number of unique SMILES to return
  44. Returns:
  45. Tuple of lists with unique SMILES and their corresponding highest
  46. log-likelihoods.
  47. """
  48. sampled_smiles_unique = []
  49. log_lhs_unique = []
  50. for top_k_smiles, top_k_llhs in zip(sampled_smiles, log_lhs):
  51. top_k_mols = [Chem.MolFromSmiles(smi) for smi in top_k_smiles]
  52. top_k_smiles = [Chem.MolToSmiles(mol) for mol in top_k_mols if mol]
  53. top_k_llhs = [llhs for llhs, mol in zip(top_k_llhs, top_k_mols) if mol]
  54. top_k_mols = [mol for mol in top_k_mols if mol]
  55. top_k_unique = pd.DataFrame(
  56. {
  57. "smiles": top_k_smiles,
  58. "log_likelihood": top_k_llhs,
  59. "molecules": top_k_mols,
  60. }
  61. )
  62. top_k_unique.drop_duplicates(subset=["smiles"], keep="first", inplace=True)
  63. sampled_smiles_unique.append(list(top_k_unique["smiles"].values[0:n_unique_beams]))
  64. log_lhs_unique.append(list(top_k_unique["log_likelihood"].values[0:n_unique_beams]))
  65. return (
  66. sampled_smiles_unique,
  67. log_lhs_unique,
  68. )
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...