Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

categorical_encoders.py 2.3 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
  1. from typing import List
  2. import numpy as np
  3. import pandas as pd
  4. from feature_engine.encoding import (
  5. OrdinalEncoder,
  6. OneHotEncoder,
  7. CountFrequencyEncoder,
  8. RareLabelEncoder
  9. )
  10. # --------------------------------------
  11. # カテゴリー値に変換
  12. # --------------------------------------
  13. def ordinal_encoder(
  14. df: pd.DataFrame,
  15. cat_col_names: List,
  16. ) -> pd.DataFrame:
  17. oe = OrdinalEncoder(
  18. encoding_method='arbitrary',
  19. variables=cat_col_names,
  20. )
  21. df = oe.fit_transform(df)
  22. return df
  23. # --------------------------------------
  24. # (0, 1)に変換
  25. # --------------------------------------
  26. def onehot_encoder(
  27. df: pd.DataFrame,
  28. cat_col_names: List,
  29. ) -> pd.DataFrame:
  30. ohe = OneHotEncoder(
  31. top_categories=10,
  32. drop_last=True,
  33. variables=cat_col_names
  34. )
  35. df_ohe = ohe.fit_transform(df[cat_col_names])
  36. df_ = pd.concat([df, df_ohe], axis=1)
  37. return df_
  38. # --------------------------------------
  39. # ラベルの出現頻度に変換
  40. # --------------------------------------
  41. def count_freq_encoder(
  42. df: pd.DataFrame,
  43. cat_col_names: List,
  44. ) -> pd.DataFrame:
  45. ce = CountFrequencyEncoder(
  46. encoding_method='frequency',
  47. variables=cat_col_names
  48. )
  49. df_ce = ce.fit_transform(df[cat_col_names])
  50. df_ce = df_ce.add_suffix("_count_freq")
  51. df_ = pd.concat([df, df_ce], axis=1)
  52. return df_
  53. # --------------------------------------
  54. # 出現頻度の低いラベルはまとめたカテゴリーに変換
  55. # --------------------------------------
  56. def rarelabel_encoder(
  57. df: pd.DataFrame,
  58. cat_col_names: List,
  59. ) -> pd.DataFrame:
  60. re = RareLabelEncoder(
  61. tol=0.10,
  62. n_categories=10,
  63. variables=cat_col_names
  64. )
  65. df = re.fit_transform(df)
  66. return df
  67. # --------------------------------------
  68. # 出現頻度のランクに変換
  69. # --------------------------------------
  70. def count_rank_encoder(
  71. df: pd.DataFrame,
  72. cat_col_names: List,
  73. ) -> pd.DataFrame:
  74. for col in cat_col_names:
  75. count_rank = df.groupby(col)[col].count().rank(ascending=False)
  76. df[f"{col}_count"] = df[col].map(count_rank)
  77. return df
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...