Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

dibeties_prediction.py 5.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import warnings

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import xgboost as xg
from scipy.stats import norm
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC

# Silence library warnings so the metric printout stays readable.
warnings.filterwarnings('ignore')
  15. diabetes_dataset = pd.read_csv('diabetes_sl.csv')
  16. #replacing 0 values with mean values if data is normally distributed and with median values if its a skewed distribution
  17. diabetes_dataset['Glucose']=diabetes_dataset['Glucose'].replace(0,diabetes_dataset['Glucose'].mean())#normal distribution
  18. diabetes_dataset['BloodPressure']=diabetes_dataset['BloodPressure'].replace(0,diabetes_dataset['BloodPressure'].mean())#normal distribution
  19. diabetes_dataset['SkinThickness']=diabetes_dataset['SkinThickness'].replace(0,diabetes_dataset['SkinThickness'].median())#skewed distribution
  20. diabetes_dataset['Insulin']=diabetes_dataset['Insulin'].replace(0,diabetes_dataset['Insulin'].median())#skewed distribution
  21. diabetes_dataset['BMI']=diabetes_dataset['BMI'].replace(0,diabetes_dataset['BMI'].median())#skewed distribution
  22. # separating the data and labels
  23. X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
  24. Y = diabetes_dataset['Outcome']
  25. print(X)
  26. print(Y)
  27. X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, stratify=Y, random_state=2)
  28. print(X.shape, X_train.shape, X_test.shape)
  29. import mlflow
  30. mlflow.set_tracking_uri("https://dagshub.com/Rasheedabanu606/MLOPS_Final.mlflow")
  31. import os
  32. os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/Rasheedabanu606/MLOPS_Final.mlflow"
  33. os.environ["MLFLOW_TRACKING_USERNAME"] = "Rasheedabanu606"
  34. os.environ["MLFLOW_TRACKING_PASSWORD"] = "8b59378253ce9855de513080bf422889a8559050"
  35. # Define the experiment name
  36. experiment_name = "diabetes_prediction_experiment"
  37. # Check if the experiment exists
  38. experiment = mlflow.get_experiment_by_name(experiment_name)
  39. if experiment is None:
  40. # If the experiment does not exist, create it
  41. mlflow.create_experiment(experiment_name)
  42. # Set the experiment
  43. mlflow.set_experiment(experiment_name)
  44. from sklearn.linear_model import LogisticRegression
  45. from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
  46. from sklearn.svm import SVC
  47. from sklearn.metrics import accuracy_score, recall_score, f1_score
  48. from sklearn.model_selection import GridSearchCV
  49. import mlflow
  50. # Assuming you have X_train, Y_train, X_test, Y_test defined
  51. # Define hyperparameters grid for each classifier
  52. param_grid = {
  53. 'LogisticRegression': {'C': [0.1, 1.0, 10.0]},
  54. 'RandomForestClassifier': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
  55. 'SVC': {'C': [0.1, 1.0, 10.0], 'kernel': ['linear', 'rbf']},
  56. 'GradientBoostingClassifier': {'n_estimators': [50, 100, 200], 'learning_rate': [0.1, 0.01]}
  57. }
  58. # List of classifiers
  59. al_list = [
  60. LogisticRegression(),
  61. RandomForestClassifier(),
  62. SVC(),
  63. GradientBoostingClassifier()
  64. ]
  65. for classifier in al_list:
  66. # Retrieve the hyperparameters grid for the current classifier
  67. params = param_grid[classifier._class.name_]
  68. # Perform GridSearchCV for hyperparameter tuning
  69. grid_search = GridSearchCV(classifier, params, scoring='accuracy', cv=3)
  70. grid_search.fit(X_train, Y_train)
  71. # Get the best estimator and its parameters
  72. best_estimator = grid_search.best_estimator_
  73. best_params = grid_search.best_params_
  74. # Use the best estimator to make predictions
  75. y_train_pred = best_estimator.predict(X_train)
  76. y_test_pred = best_estimator.predict(X_test)
  77. # Calculate metrics
  78. accuracy_train = accuracy_score(Y_train, y_train_pred)
  79. accuracy_test = accuracy_score(Y_test, y_test_pred)
  80. recall_train = recall_score(Y_train, y_train_pred)
  81. recall_test = recall_score(Y_test, y_test_pred)
  82. f1_train = f1_score(Y_train, y_train_pred)
  83. f1_test = f1_score(Y_test, y_test_pred)
  84. # Print metrics
  85. print(f'Metrics for {classifier._class.name_}:')
  86. print(f' Best Parameters: {best_params}')
  87. print(f' Accuracy (Train): {accuracy_train:.4f}')
  88. print(f' Accuracy (Test): {accuracy_test:.4f}')
  89. print(f' Recall (Train): {recall_train:.4f}')
  90. print(f' Recall (Test): {recall_test:.4f}')
  91. print(f' F1 Score (Train): {f1_train:.4f}')
  92. print(f' F1 Score (Test): {f1_test:.4f}')
  93. # MLflow logging
  94. with mlflow.start_run(run_name=str(classifier._class.name_)):
  95. mlflow.log_params(best_params)
  96. mlflow.log_metric("accuracy_train", accuracy_train)
  97. mlflow.log_metric("accuracy_test", accuracy_test)
  98. mlflow.log_metric("recall_train", recall_train)
  99. mlflow.log_metric("recall_test", recall_test)
  100. mlflow.log_metric("f1_train", f1_train)
  101. mlflow.log_metric("f1_test", f1_test)
Tip!

Press p or ← to see the previous file, or n or → to see the next file.

Comments

Loading...