Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

feature_generation.py 1.4 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
  1. from typing import List
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn.cluster import KMeans
  5. from feature_engine.datetime import DatetimeFeatures
  6. def create_math_transforms(
  7. df: pd.DataFrame
  8. ) -> pd.DataFrame:
  9. df["A_B_ratio"] = df["A"]/df["B"]
  10. return df
  11. def create_count_sum(
  12. df: pd.DataFrame
  13. ) -> pd.DataFrame:
  14. df["sum_of_ABCD"] = df[["A", "B", "C", "D"]].gt(0.0).sum(axis=1)
  15. return df
  16. def break_down_category(
  17. df: pd.DataFrame
  18. ) -> pd.DataFrame:
  19. df["A_prefix"] = df["A"].str.split("_", n=1, expand=True)[0]
  20. return df
  21. def create_grouped_tranforms(
  22. df: pd.DataFrame
  23. ) -> pd.DataFrame:
  24. df["A_group_median"] = df.groupby("Group")["A"].transform("median")
  25. return df
  26. def create_clustering_features(
  27. df: pd.DataFrame,
  28. target: str,
  29. ) -> pd.DataFrame:
  30. X = df.drop(columns=target)
  31. X_scaled = (X - X.mean(axis=0)) / X.std(axis=0)
  32. kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
  33. df["cluster_number"] = kmeans.fit_predict(X_scaled)
  34. return df
  35. def create_datetime_features(
  36. df: pd.DataFrame,
  37. date_col_names: List
  38. ) -> pd.DataFrame:
  39. dtfs = DatetimeFeatures(
  40. variables = date_col_names,
  41. features_to_extract=["month", "month_end", "day_of_year"],
  42. drop_original=True
  43. )
  44. df = dtfs.fit_transform(df)
  45. return df
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...