Machine Learning courses with 110+ Real-time projects Start Now!!
Program 1
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
# Train an XGBoost classifier on the diabetes CSV and report its accuracy
# on a held-out test split.

# Load data
df = pd.read_csv("D://scikit_data/diabetes/diabetes_prediction_dataset.csv")  # columns: Glucose, BMI, etc.
# NOTE(review): every column except "Outcome" is passed to XGBoost unchanged,
# so the features are presumably all numeric -- confirm against the CSV and
# encode any text columns before fitting if they are not.
X = df.drop("Outcome", axis=1)  # input: all columns except the target
y = df["Outcome"]  # output: the target column

# Train-test split: hold out 20% of the rows for evaluation; the fixed
# random_state makes the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# use_label_encoder is intentionally NOT passed.
# It only mattered in older XGBoost releases, where use_label_encoder=False
# silenced a deprecation warning about the library's internal label encoder.
# That encoder has since been removed; current releases treat the argument
# as unknown and warn that the parameter is not used.
# Labels are therefore expected to be encoded externally (e.g. with
# scikit-learn's LabelEncoder) when they are not already integer classes.
# NOTE(review): presumably "Outcome" is already 0/1 -- confirm.
#
# eval_metric='logloss'
# Sets the evaluation metric used during training.
# logloss (logarithmic loss) is a common metric for binary classification:
# it penalizes confident wrong predictions more than slightly wrong ones.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Predict class labels for the held-out rows
y_pred = model.predict(X_test)

# Accuracy: fraction of held-out rows whose predicted label matches the true one
print("Accuracy:", accuracy_score(y_test, y_pred))