import pandas as pd

# Small example dataset used by the chapter snippets.
# Ten example weld records shared by every snippet below: three process
# settings, three image statistics, and three candidate targets (a numeric
# strength, a 0/1 defect flag, and a multi-class defect label).
df = pd.DataFrame(dict(
    voltage=[22.1, 19.5, 24.3, 23.9, 20.1, 21.8, 22.5, 24.8, 23.0, 20.7],
    travel_speed=[4.8, 5.5, 4.1, 4.0, 5.2, 4.9, 4.7, 3.9, 4.3, 5.1],
    heat_input=[1.15, 0.92, 1.35, 1.42, 0.98, 1.10, 1.18, 1.48, 1.30, 1.02],
    mean_pixel=[82.4, 61.7, 91.2, 96.5, 70.5, 79.2, 84.1, 98.0, 88.3, 73.4],
    std_pixel=[11.2, 18.5, 13.4, 21.0, 15.8, 12.0, 10.9, 22.5, 14.1, 16.2],
    defect_area=[0.0, 3.0, 0.0, 11.0, 1.0, 0.0, 0.5, 14.0, 4.2, 0.0],
    tensile_strength_mpa=[512, 455, 530, 438, 470, 505, 498, 420, 462, 500],
    is_defective=[0, 1, 0, 1, 1, 0, 1, 1, 1, 0],
    defect_class=[
        'no_defect', 'porosity', 'no_defect', 'crack', 'porosity',
        'no_defect', 'porosity', 'crack', 'porosity', 'no_defect',
    ],
))

# Bare expression: in a notebook cell this displays the table.
df

# A numeric value -- presumably an example of what a regression model outputs.
tensile_strength_mpa = 485.2

# A text label -- presumably an example of what a classification model outputs.
inspection_result = 'fail'

# Name of a numeric target. NOTE(review): 'defect_length_mm' is not one of the
# columns of the example DataFrame built above, so this is illustrative only.
target = 'defect_length_mm'

# Name of a label target; this rebinds `target`, replacing the value above.
target = 'defect_class'

# Single-input linear model: one feature scaled by a slope, plus an intercept.
# NOTE: intercept, slope and weld_length_mm are placeholders that are not
# defined in the visible snippets; this line shows the formula only.
predicted_strength = intercept + slope * weld_length_mm

# Multi-input linear model: one weight per feature, plus an intercept.
# The expression is wrapped in parentheses so it can legally span several
# lines; without them the indented continuation lines are a syntax error.
# NOTE: intercept, weight_* and the feature names are placeholders as well.
predicted_strength = (
    intercept
    + weight_1 * voltage
    + weight_2 * travel_speed
    + weight_3 * heat_input
)

from sklearn.linear_model import LinearRegression

# Inputs are the three process settings; the target is the numeric strength column.
X = df.loc[:, ['voltage', 'travel_speed', 'heat_input']]
y = df.loc[:, 'tensile_strength_mpa']

# Build the estimator and train it in one chained expression.
model = LinearRegression().fit(X, y)

# Predict for the same rows that were used for fitting.
predictions = model.predict(X)

probability_of_defect = sigmoid(score)

if probability_of_defect >= 0.5:
    predict 'defective'
else:
    predict 'not_defective'

from sklearn.linear_model import LogisticRegression

# Inputs are the three image statistics; the target is the 0/1 defect flag.
X = df.loc[:, ['mean_pixel', 'std_pixel', 'defect_area']]
y = df.loc[:, 'is_defective']

# Build the classifier with default settings and train it in one expression.
model = LogisticRegression().fit(X, y)

# Predicted labels, then predicted probabilities, for the training rows.
predicted_classes = model.predict(X)
predicted_probabilities = model.predict_proba(X)

Is mean_pixel < 70?
    yes -> Is defect_area > 12?
        yes -> predict 'defective'
        no  -> predict 'not_defective'
    no  -> predict 'not_defective'

from sklearn.tree import DecisionTreeClassifier

# Inputs are the three image statistics; the target is the defect-type label.
X = df.loc[:, ['mean_pixel', 'std_pixel', 'defect_area']]
y = df.loc[:, 'defect_class']

# Depth is capped at 3 and the seed is fixed at 42.
model = DecisionTreeClassifier(random_state=42, max_depth=3).fit(X, y)

# Predict for the same rows that were used for fitting.
predictions = model.predict(X)

tree_1 predicts 'porosity'
tree_2 predicts 'crack'
tree_3 predicts 'porosity'

forest prediction = 'porosity'

from sklearn.ensemble import RandomForestClassifier

# Inputs are the three image statistics; the target is the defect-type label.
X = df.loc[:, ['mean_pixel', 'std_pixel', 'defect_area']]
y = df.loc[:, 'defect_class']

# A forest of 100 trees with a fixed seed, trained in one chained expression.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X, y)

# Predict for the same rows that were used for fitting.
predictions = model.predict(X)

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Inputs are the three image statistics; the target is the 0/1 defect flag.
X = df.loc[:, ['mean_pixel', 'std_pixel', 'defect_area']]
y = df.loc[:, 'is_defective']

# Two-step pipeline: standardise the features, then an RBF-kernel SVC.
model = make_pipeline(StandardScaler(), SVC(kernel='rbf')).fit(X, y)

# Predict for the same rows that were used for fitting.
predictions = model.predict(X)

"visible crack near edge"

from sklearn.naive_bayes import GaussianNB

# Inputs are the three image statistics; the target is the defect-type label.
X = df.loc[:, ['mean_pixel', 'std_pixel', 'defect_area']]
y = df.loc[:, 'defect_class']

# Gaussian naive Bayes with default settings, trained in one expression.
model = GaussianNB().fit(X, y)

# Predict for the same rows that were used for fitting.
predictions = model.predict(X)

voltage, travel_speed, heat_input -> hidden layers -> defect probability

weld image -> image filters -> learned features -> defect class

from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Inputs are the three image statistics; the target is the defect-type label.
X = df[['mean_pixel', 'std_pixel', 'defect_area']]
y = df['defect_class']

# Two-step pipeline: standardise the features, then a network with hidden
# layers of 32 and 16 units and a fixed seed.
# max_iter is set explicitly: scikit-learn's default of 200 iterations is
# likely to end training before convergence on a dataset this small and emit
# a ConvergenceWarning (NOTE(review): confirm on the target sklearn version).
model = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(32, 16), max_iter=2000, random_state=42),
)
model.fit(X, y)

# Predict for the same rows that were used for fitting.
predictions = model.predict(X)

model_1 makes initial predictions
model_2 learns from model_1's errors
model_3 learns from remaining errors
final prediction combines all models

from sklearn.ensemble import GradientBoostingClassifier

# Inputs are the three image statistics; the target is the defect-type label.
X = df.loc[:, ['mean_pixel', 'std_pixel', 'defect_area']]
y = df.loc[:, 'defect_class']

# Gradient-boosted classifier with a fixed seed, trained in one expression.
model = GradientBoostingClassifier(random_state=42).fit(X, y)

# Predict for the same rows that were used for fitting.
predictions = model.predict(X)

cluster 0 -> stable process measurements
cluster 1 -> unusually high heat input
cluster 2 -> low brightness images

from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Clustering uses no target column: only two process settings and one image statistic.
X = df.loc[:, ['voltage', 'travel_speed', 'mean_pixel']]

# Two-step pipeline: standardise the features, then k-means with 3 clusters
# and a fixed seed.
model = make_pipeline(StandardScaler(), KMeans(n_clusters=3, random_state=42))

# Fit the pipeline and get one cluster id per row in a single call.
cluster_ids = model.fit_predict(X)

# Store the assignment on the shared DataFrame as a new column.
df['cluster_id'] = cluster_ids

Algorithm Families - Code Snippets

Setup - run this first

Regression vs. Classification

When to Use Regression

When to Use Classification

Linear Regression

Logistic Regression

Decision Trees

Random Forest

Support Vector Machines

Naive Bayes

Neural Networks

Gradient Boosting

Clustering