In [1]:
Copied!
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Toy XOR-style dataset: the two classes are NOT linearly separable,
# so a linear model such as logistic regression cannot fit it well.
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])
y = np.array([0, 1, 1, 0])

model = LogisticRegression()
model.fit(X, y)

# Evaluate on the training points themselves (tiny demo dataset).
preds = model.predict(X)
acc = accuracy_score(y, preds)
print("Predictions:", preds)
print("Accuracy:", acc)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Toy XOR-style dataset: the two classes are NOT linearly separable,
# so a linear model such as logistic regression cannot fit it well.
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [1, 1]])
y = np.array([0, 1, 1, 0])

model = LogisticRegression()
model.fit(X, y)

# Evaluate on the training points themselves (tiny demo dataset).
preds = model.predict(X)
acc = accuracy_score(y, preds)
print("Predictions:", preds)
print("Accuracy:", acc)
Predictions: [0 0 0 0] Accuracy: 0.5
1.2 Regression¶
In [2]:
Copied!
from sklearn.linear_model import LinearRegression

# Tiny 1-D regression demo: fit y ≈ coef * x + intercept by least squares.
X_reg = np.array([[1], [2], [3], [4], [5]])  # feature column (n_samples, 1)
y_reg = np.array([2, 4, 5, 4, 5])            # target values

reg_model = LinearRegression()
reg_model.fit(X_reg, y_reg)

# Predictions on the training inputs themselves.
y_pred = reg_model.predict(X_reg)

# Learned line: slope in coef_, offset in intercept_.
print("Coefficients:", reg_model.coef_)
print("Intercept:", reg_model.intercept_)
print("Predictions:", y_pred)
from sklearn.linear_model import LinearRegression

# Tiny 1-D regression demo: fit y ≈ coef * x + intercept by least squares.
X_reg = np.array([[1], [2], [3], [4], [5]])  # feature column (n_samples, 1)
y_reg = np.array([2, 4, 5, 4, 5])            # target values

reg_model = LinearRegression()
reg_model.fit(X_reg, y_reg)

# Predictions on the training inputs themselves.
y_pred = reg_model.predict(X_reg)

# Learned line: slope in coef_, offset in intercept_.
print("Coefficients:", reg_model.coef_)
print("Intercept:", reg_model.intercept_)
print("Predictions:", y_pred)
Coefficients: [0.6] Intercept: 2.2 Predictions: [2.8 3.4 4. 4.6 5.2]
In [3]:
Copied!
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Chain feature scaling with a linear SVM so both steps are fit and
# applied together; the second feature is on a much larger scale.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(kernel="linear")),
    ]
)

# Synthetic data: 4 samples, 2 features with very different magnitudes.
X2 = np.array([[1, 200], [2, 180], [3, 240], [4, 210]])
y2 = np.array([0, 0, 1, 1])

pipeline.fit(X2, y2)
pred2 = pipeline.predict(X2)
pred2
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Chain feature scaling with a linear SVM so both steps are fit and
# applied together; the second feature is on a much larger scale.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("svc", SVC(kernel="linear")),
    ]
)

# Synthetic data: 4 samples, 2 features with very different magnitudes.
X2 = np.array([[1, 200], [2, 180], [3, 240], [4, 210]])
y2 = np.array([0, 0, 1, 1])

pipeline.fit(X2, y2)
pred2 = pipeline.predict(X2)
pred2
Out[3]:
array([0, 0, 1, 1])
2.2 Model Selection and Cross-Validation¶
scikit-learn provides utilities for hyperparameter tuning, such as GridSearchCV and RandomizedSearchCV.
In [4]:
Copied!
from sklearn.model_selection import GridSearchCV

# Tune the SVC step inside the pipeline; the "svc__" prefix routes each
# parameter to the pipeline step named "svc".
param_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__kernel": ["linear", "rbf"],
}

# cv=2 because the toy dataset has only 4 samples (2 per class).
grid_search = GridSearchCV(pipeline, param_grid, cv=2)
grid_search.fit(X2, y2)

print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
from sklearn.model_selection import GridSearchCV

# Tune the SVC step inside the pipeline; the "svc__" prefix routes each
# parameter to the pipeline step named "svc".
param_grid = {
    "svc__C": [0.1, 1, 10],
    "svc__kernel": ["linear", "rbf"],
}

# cv=2 because the toy dataset has only 4 samples (2 per class).
grid_search = GridSearchCV(pipeline, param_grid, cv=2)
grid_search.fit(X2, y2)

print("Best Params:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)
Best Params: {'svc__C': 0.1, 'svc__kernel': 'linear'} Best Score: 1.0
2.3 Feature Engineering¶
Feature engineering transforms raw data into features suitable for model training. scikit-learn has classes like PolynomialFeatures and CountVectorizer (for text), among others.
In [5]:
Copied!
from sklearn.preprocessing import PolynomialFeatures

# Expand a single feature x into [1, x, x**2]: bias column plus all
# polynomial terms up to degree 2.
X_poly = np.array([[2], [3], [4]])
poly = PolynomialFeatures(degree=2)
X_transformed = poly.fit_transform(X_poly)

print("Original:", X_poly)
print("Polynomial Features:\n", X_transformed)
from sklearn.preprocessing import PolynomialFeatures

# Expand a single feature x into [1, x, x**2]: bias column plus all
# polynomial terms up to degree 2.
X_poly = np.array([[2], [3], [4]])
poly = PolynomialFeatures(degree=2)
X_transformed = poly.fit_transform(X_poly)

print("Original:", X_poly)
print("Polynomial Features:\n", X_transformed)
Original: [[2] [3] [4]] Polynomial Features: [[ 1. 2. 4.] [ 1. 3. 9.] [ 1. 4. 16.]]
2.4 Common Metrics¶
In addition to accuracy, scikit-learn offers precision_score, recall_score, f1_score, r2_score, and more.
In [6]:
Copied!
from sklearn.metrics import precision_score, recall_score

# Ground truth vs. predictions: the positive at index 2 is missed,
# so precision is perfect (no false positives) but recall is 1/2.
y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]

print("Precision:", precision_score(y_true, y_pred))
print("Recall: ", recall_score(y_true, y_pred))
from sklearn.metrics import precision_score, recall_score

# Ground truth vs. predictions: the positive at index 2 is missed,
# so precision is perfect (no false positives) but recall is 1/2.
y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]

print("Precision:", precision_score(y_true, y_pred))
print("Recall: ", recall_score(y_true, y_pred))
Precision: 1.0 Recall: 0.5
3. Exercises ¶
Exercise 1: Classification¶
- Use any small dataset (or generate synthetic data) for a classification task.
- Train a logistic regression model.
- Print accuracy.
In [8]:
Copied!
# Your code here.
# Example skeleton (uncomment and complete). Note: these are training
# data for the exercise, so avoid the misleading "_test" suffix:
# X_cls = np.array([[...], [...], ...])   # feature rows
# y_cls = np.array([...])                 # class labels (0/1)
# logistic_model = LogisticRegression()
# logistic_model.fit(X_cls, y_cls)
# preds_cls = logistic_model.predict(X_cls)
# print(accuracy_score(y_cls, preds_cls))
# Your code here.
# Example skeleton (uncomment and complete). Note: these are training
# data for the exercise, so avoid the misleading "_test" suffix:
# X_cls = np.array([[...], [...], ...])   # feature rows
# y_cls = np.array([...])                 # class labels (0/1)
# logistic_model = LogisticRegression()
# logistic_model.fit(X_cls, y_cls)
# preds_cls = logistic_model.predict(X_cls)
# print(accuracy_score(y_cls, preds_cls))
Exercise 2: Pipeline¶
- Create a pipeline with a StandardScaler and a KNeighborsClassifier.
- Fit it on some toy dataset.
- Predict and evaluate the performance.
In [9]:
Copied!
# Your code here.
# Hint: build a Pipeline with a StandardScaler step followed by a
# KNeighborsClassifier step, then call .fit(), .predict(), and a metric.
Exercise 3: Grid Search¶
- Use GridSearchCV on any model of your choice (e.g., SVC).
- Print the best parameters and best score.
In [10]:
Copied!
# Your code here.
# Hint: wrap your estimator in GridSearchCV(estimator, param_grid, cv=...),
# fit it, then inspect best_params_ and best_score_.
4. Real-World Applications ¶
Classification Tasks¶
- Spam Detection: Email text classification.
- Image Recognition: Digits dataset or complex images.
Regression Tasks¶
- House Price Prediction: Predicting real estate prices based on features.
- Stock Forecasting: Although more advanced time-series methods exist, scikit-learn can handle simple regression or feature-based approaches.
Clustering¶
- Customer Segmentation: Using KMeans or DBSCAN to group similar customers.
Model Deployment¶
- scikit-learn models can be saved (e.g., with joblib) and deployed within web applications for real-time inference.
scikit-learn’s consistent API and wide range of algorithms make it a go-to toolkit for ML in Python.