Site icon DataFlair

Introduction to XGBoost Algorithm

Machine Learning courses with 110+ Real-time projects Start Now!!

Program 1

Diabetes Prediction Dataset

import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Load data
# NOTE(review): assumes the CSV has a target column named "Outcome" and that
# every remaining column is numeric — confirm against the file header. Text
# columns would have to be encoded (e.g. with LabelEncoder) before fitting.
df = pd.read_csv("D://scikit_data/diabetes/diabetes_prediction_dataset.csv")  # columns: Glucose, BMI, etc.
X = df.drop("Outcome", axis=1)  # input: every column except the target
y = df["Outcome"]  # output: the label to predict

# Train-test split
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# eval_metric='logloss'
# Sets the evaluation metric used during training.
# Log loss (logarithmic loss) is a common metric for binary classification:
# it penalizes confident wrong predictions more than slightly wrong ones.
#
# use_label_encoder is intentionally NOT passed.
# Older XGBoost releases encoded class labels internally and needed
# use_label_encoder=False to silence a deprecation warning. The option has
# since been retired; recent releases treat it as an unknown parameter and
# warn that it is "not used". Labels are expected to already be integers
# 0..n_classes-1; encode them externally with pandas/scikit-learn otherwise.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Predict class labels for the held-out rows
y_pred = model.predict(X_test)
# Accuracy: fraction of held-out rows whose predicted label matches the true one
print("Accuracy:", accuracy_score(y_test, y_pred))
Exit mobile version