Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

preprocess.py 4.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
  1. from dataclasses import *
  2. from functools import cached_property
  3. from sklearn.base import TransformerMixin
  4. from sklearn.preprocessing import LabelEncoder
  5. from yspecies.dataset import ExpressionDataset
  6. from yspecies.utils import *
  7. from loguru import logger
  8. @dataclass(frozen=True)
  9. class FeatureSelection:
  10. '''
  11. Class that contains parameters for feature selection
  12. '''
  13. samples: List[str] = field(default_factory=lambda: ["tissue", "species"])
  14. species: List[str] = field(default_factory=lambda: [])
  15. genes: List[str] = None #if None = takes all genes
  16. to_predict: str = "lifespan"
  17. categorical: List[str] = field(default_factory=lambda: ["tissue"])
  18. exclude_from_training: List[str] = field(default_factory=lambda: ["species"])#columns that should note be used for training
  19. genes_meta: pd.DataFrame = None #metada for genes, TODO: check if still needed
  20. select_by: str = "shap"
  21. importance_type: str = "gain"
  22. feature_perturbation: str = "tree_path_dependent"
  23. not_validated_species: List[str] = field(default_factory=lambda: [])
  24. @cached_property
  25. def species_non_categorical(self) -> List:
  26. return [s for s in self.species if s not in self.categorical]
  27. @property
  28. def has_categorical(self):
  29. return self.categorical is not None and len(self.categorical) > 0
  30. def prepare_for_training(self, df: pd.DataFrame):
  31. return df if self.exclude_from_training is None else df.drop(columns=self.exclude_from_training, errors="ignore")
  32. @property
  33. def y_name(self):
  34. '''
  35. Just for nice display in jupyter
  36. :return:
  37. '''
  38. return f"Y_{self.to_predict}"
  39. def _repr_html_(self):
  40. return f"<table border='2'>" \
  41. f"<caption> Selected feature columns <caption>" \
  42. f"<tr><th>Samples metadata</th><th>Species metadata</th><th>Genes</th><th>Predict label</th><th>not_validated species</th></tr>" \
  43. f"<tr><td>{str(self.samples)}</td><td>{str(self.species)}</td><td>{'all' if self.genes is None else str(self.genes)}</td><td>{str(self.to_predict)}</td><td>{self.not_validated_species}</td></tr>" \
  44. f"</table>"
  45. class EncodedFeatures:
  46. def __init__(self, features: FeatureSelection, samples: pd.DataFrame, genes_meta: pd.DataFrame = None):
  47. self.genes_meta = genes_meta
  48. self.features = features
  49. self.samples = samples
  50. if len(features.categorical) < 1:
  51. self.encoders = []
  52. else:
  53. self.encoders: Dict[str, LabelEncoder] = {f: LabelEncoder() for f in features.categorical}
  54. for col, encoder in self.encoders.items():
  55. col_encoded = col+"_encoded"
  56. self.samples[col_encoded] = encoder.fit_transform(samples[col].values)
  57. @cached_property
  58. def y(self) -> pd.Series:
  59. return self.samples[self.features.to_predict].rename(self.features.to_predict)
  60. @cached_property
  61. def X(self):
  62. return self.samples.drop(columns=[self.features.to_predict])
  63. def __repr__(self):
  64. #to fix jupyter freeze (see https://github.com/ipython/ipython/issues/9771 )
  65. return self._repr_html_()
  66. def _repr_html_(self):
  67. return f"<table><caption>top 10 * 100 features/samples</caption><tr><td>{self.features._repr_html_()}</td><tr><td>{show(self.samples,100,10)._repr_html_()}</td></tr>"
  68. @dataclass(frozen=True)
  69. class DataExtractor(TransformerMixin):
  70. '''
  71. Workflow stage which extracts Data from ExpressionDataset
  72. '''
  73. def fit(self, X, y=None) -> 'DataExtractor':
  74. return self
  75. def transform(self, to_extract: Tuple[ExpressionDataset, FeatureSelection]) -> EncodedFeatures:
  76. data, features = to_extract
  77. samples = data.extended_samples(features.samples, features.species)
  78. exp = data.expressions if features.genes is None else data.expressions[features.genes]
  79. X: pd.dataFrame = samples.join(exp, how="inner")
  80. samples = data.get_label(features.to_predict).join(X)
  81. return EncodedFeatures(features, samples, data.genes_meta)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...