
add a new example #14

Merged · 2 commits · Apr 17, 2022
42 changes: 42 additions & 0 deletions examples/digits/run.sql
@@ -0,0 +1,42 @@
-- This example trains models on the sklearn digits dataset,
-- which is a copy of the test set of the UCI ML hand-written digits dataset:
-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
--
-- The final result after a few seconds of training is not terrible. Maybe not accurate
-- enough for mission-critical applications, but it's telling how quickly "off the shelf"
-- solutions can solve problems these days.
SELECT pgml.load_dataset('digits');

-- view the dataset
SELECT * FROM pgml.digits;

-- train a simple model to classify the data
SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target');

-- check out the predictions
SELECT target, pgml.predict('Handwritten Digit Image Classifier', image) AS prediction
FROM pgml.digits
LIMIT 10;

-- train some more models with different algorithms
SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'svm');
SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'random_forest');
SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'gradient_boosting_trees');
-- TODO SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'dense_neural_network');
-- check out all that hard work
SELECT * FROM pgml.trained_models;

-- deploy the random_forest model for prediction use
SELECT pgml.deploy('Handwritten Digit Image Classifier', 'random_forest');
-- check out that throughput
SELECT * FROM pgml.deployed_models;

-- do some hyperparameter tuning
-- TODO SELECT pgml.hypertune(100, 'Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'gradient_boosting_trees');
-- deploy the "best" model for prediction use
SELECT pgml.deploy('Handwritten Digit Image Classifier', 'best_fit');

-- check out the improved predictions
SELECT target, pgml.predict('Handwritten Digit Image Classifier', image) AS prediction
FROM pgml.digits
LIMIT 10;
2 changes: 1 addition & 1 deletion pgml/pgml/__init__.py
@@ -1,2 +1,2 @@
 def version():
-    return "0.4.1"
+    return "0.4.2"
21 changes: 21 additions & 0 deletions pgml/pgml/datasets.py
@@ -0,0 +1,21 @@
import plpy
from sklearn.datasets import load_digits as sklearn_load_digits

from pgml.sql import q
from pgml.exceptions import PgMLException

def load(source: str):
    if source == "digits":
        load_digits()
    else:
        raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].")
    return "OK"

def load_digits():
    # (Re)create the table, store the dataset description as a table comment,
    # then insert one row per image.
    dataset = sklearn_load_digits()
    plpy.execute("DROP TABLE IF EXISTS pgml.digits")
    plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")
    plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""")
    for X, y in zip(dataset["data"], dataset["target"]):
        X = ",".join("%i" % x for x in list(X))
        plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""")
141 changes: 107 additions & 34 deletions pgml/pgml/model.py
@@ -1,14 +1,23 @@
 from re import M
 import plpy
-from sklearn.linear_model import LinearRegression
-from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.svm import SVR, SVC
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.metrics import mean_squared_error, r2_score, f1_score, precision_score, recall_score

 import pickle
+import json

 from pgml.exceptions import PgMLException
 from pgml.sql import q

+def flatten(S):
Review comment (Contributor): Are you sure this won't blow the stack on a large dataset?

Reply (Contributor Author): It's called per row, and I haven't seen datasets with more than 4D arrays.

+    if S == []:
+        return S
+    if isinstance(S[0], list):
+        return flatten(S[0]) + flatten(S[1:])
+    return S[:1] + flatten(S[1:])
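
A non-recursive variant would sidestep the stack-depth concern raised in the review thread above. A minimal sketch, not part of this PR:

def flatten_iterative(S):
    # Stack-based, depth-first flatten: same output as flatten() above, but
    # bounded by heap memory rather than Python's recursion limit.
    out = []
    stack = list(reversed(S))
    while stack:
        item = stack.pop()
        if isinstance(item, list):
            stack.extend(reversed(item))
        else:
            out.append(item)
    return out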

class Project(object):
    """
@@ -124,6 +133,14 @@ def deployed_model(self):
         self._deployed_model = Model.find_deployed(self.id)
         return self._deployed_model

+    def deploy(self, algorithm_name):
+        model = None
+        if algorithm_name == "best_fit":
+            model = Model.find_by_project_and_best_fit(self)
+        else:
+            model = Model.find_by_project_id_and_algorithm_name(self.id, algorithm_name)
+        model.deploy()
+        return model

class Snapshot(object):
    """
@@ -178,7 +195,7 @@ def create(
         plpy.execute(
             f"""
             CREATE TABLE pgml."snapshot_{snapshot.id}" AS
-            SELECT * FROM "{snapshot.relation_name}";
+            SELECT * FROM {snapshot.relation_name};
             """
         )
         snapshot.__dict__ = dict(
@@ -232,6 +249,7 @@ def data(self):
             for column in columns:
                 x_.append(row[column])

+            x_ = flatten(x_)  # TODO be smart about flattening X depending on algorithm
             X.append(x_)
             y.append(y_)
@@ -262,8 +280,7 @@ class Model(object):
         status (str): The current status of the model, e.g. 'new', 'training' or 'successful'
         created_at (Timestamp): when this model was created
         updated_at (Timestamp): when this model was last updated
-        mean_squared_error (float):
-        r2_score (float):
+        metrics (dict): key performance indicators for the model
         pickle (bytes): the serialized version of the model parameters
         algorithm: the in memory version of the model parameters that can make predictions
     """
@@ -320,6 +337,63 @@ def find_deployed(cls, project_id: int):
         model.__init__()
         return model

+    @classmethod
+    def find_by_project_id_and_algorithm_name(cls, project_id: int, algorithm_name: str):
+        """
+        Args:
+            project_id (int): The project id
+            algorithm_name (str): The algorithm
+        Returns:
+            Model: most recently created model that fits the criteria
+        """
+        result = plpy.execute(
+            f"""
+            SELECT models.*
+            FROM pgml.models
+            WHERE algorithm_name = {q(algorithm_name)}
+            AND project_id = {q(project_id)}
+            ORDER BY models.created_at DESC
+            LIMIT 1
+            """
+        )
+        if len(result) == 0:
+            return None
+
+        model = Model()
+        model.__dict__ = dict(result[0])
+        model.__init__()
+        return model

+    @classmethod
+    def find_by_project_and_best_fit(cls, project: Project):
+        """
+        Args:
+            project (Project): The project
+        Returns:
+            Model: the model with the best metrics for the project
+        """
+        if project.objective == "regression":
+            metric = "mean_squared_error"
+            ordering = "ASC"  # lower error is better
+        elif project.objective == "classification":
+            metric = "f1"
+            ordering = "DESC"  # higher F1 is better
+
+        result = plpy.execute(
+            f"""
+            SELECT models.*
+            FROM pgml.models
+            WHERE project_id = {q(project.id)}
+            ORDER BY (models.metrics->>{q(metric)})::float {ordering}
Review comment (@levkk, Apr 17, 2022): Why not normalize the structure into the table?

Reply (Contributor Author): Relevant metrics are different depending on the objective. We could have another join table to hold just metrics per model, but that seems like overkill just yet.

Reply (@levkk, Apr 17, 2022): I was just thinking of making them nullable and only fill in the relevant columns for the model being trained.
+            LIMIT 1
+            """
+        )
+        if len(result) == 0:
+            return None
+
+        model = Model()
+        model.__dict__ = dict(result[0])
+        model.__init__()
+        return model
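
To make the thread above concrete, the nullable-column alternative @levkk suggests might look roughly like this hypothetical migration (illustrative only, not part of this PR):

# Hypothetical schema change discussed in review: per-metric nullable columns
# instead of a JSON blob; only the columns relevant to the objective get filled.
plpy.execute(
    """
    ALTER TABLE pgml.models
        ADD COLUMN mean_squared_error DOUBLE PRECISION,  -- regression only
        ADD COLUMN r2 DOUBLE PRECISION,                  -- regression only
        ADD COLUMN f1 DOUBLE PRECISION,                  -- classification only
        ADD COLUMN "precision" DOUBLE PRECISION,         -- classification only
        ADD COLUMN recall DOUBLE PRECISION               -- classification only
    """
)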

     def __init__(self):
         self._algorithm = None
         self._project = None
@@ -342,8 +416,13 @@ def algorithm(self):
         else:
             self._algorithm = {
                 "linear_regression": LinearRegression,
+                "linear_classification": LogisticRegression,
+                "svm_regression": SVR,
+                "svm_classification": SVC,
                 "random_forest_regression": RandomForestRegressor,
                 "random_forest_classification": RandomForestClassifier,
+                "gradient_boosting_trees_regression": GradientBoostingRegressor,
+                "gradient_boosting_trees_classification": GradientBoostingClassifier,
             }[self.algorithm_name + "_" + self.project.objective]()

         return self._algorithm
@@ -362,8 +441,14 @@ def fit(self, snapshot: Snapshot):

         # Test
         y_pred = self.algorithm.predict(X_test)
-        msq = mean_squared_error(y_test, y_pred)
-        r2 = r2_score(y_test, y_pred)
+        metrics = {}
+        if self.project.objective == "regression":
+            metrics["mean_squared_error"] = mean_squared_error(y_test, y_pred)
+            metrics["r2"] = r2_score(y_test, y_pred)
+        elif self.project.objective == "classification":
+            metrics["f1"] = f1_score(y_test, y_pred, average="weighted")
+            metrics["precision"] = precision_score(y_test, y_pred, average="weighted")
+            metrics["recall"] = recall_score(y_test, y_pred, average="weighted")

         # Save the model
         self.__dict__ = dict(
@@ -372,8 +457,7 @@ def fit(self, snapshot: Snapshot):
                 UPDATE pgml.models
                 SET pickle = '\\x{pickle.dumps(self.algorithm).hex()}',
                     status = 'successful',
-                    mean_squared_error = {q(msq)},
-                    r2_score = {q(r2)}
+                    metrics = {q(json.dumps(metrics))}
                 WHERE id = {q(self.id)}
                 RETURNING *
             """
@@ -398,6 +482,7 @@ def predict(self, data: list):
         Returns:
             float or int: scores for regressions or class labels for classifications
         """
+        # TODO: add metrics for tracking prediction volume/accuracy by model
         return self.algorithm.predict(data)


@@ -406,6 +491,7 @@ def train(
     objective: str,
     relation_name: str,
     y_column_name: str,
+    algorithm_name: str = "linear",
     test_size: float or int = 0.1,
     test_sampling: str = "random",
 ):
@@ -416,15 +502,14 @@ def train(
         objective (str): Defaults to "regression". Valid values are ["regression", "classification"].
         relation_name (str): the table or view that stores the training data
         y_column_name (str): the column in the training data that acts as the label
-        algorithm (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "random_forest"].
+        algorithm_name (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "svm", "random_forest", "gradient_boosting_trees"].
         test_size (float or int, optional): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If train_size is also None, it will be set to 0.25.
         test_sampling (str, optional): How to sample to create the test data. Defaults to "random". Valid values are ["first", "last", "random"].
-    if objective == "regression":
-        algorithms = ["linear", "random_forest"]
-    elif objective == "classification":
-        algorithms = ["random_forest"]
-    else:
+    if algorithm_name is None:
+        algorithm_name = "linear"
+
+    if objective not in ["regression", "classification"]:
         raise PgMLException(
             f"Unknown objective `{objective}`, available options are: regression, classification."
         )
@@ -440,23 +525,11 @@ def train(
         )

     snapshot = Snapshot.create(relation_name, y_column_name, test_size, test_sampling)
-    deployed = Model.find_deployed(project.id)
-
-    # Let's assume that the deployed model is better for now.
-    best_model = deployed
-    best_error = best_model.mean_squared_error if best_model else None
-
-    for algorithm_name in algorithms:
-        model = Model.create(project, snapshot, algorithm_name)
-        model.fit(snapshot)
+    model = Model.create(project, snapshot, algorithm_name)
+    model.fit(snapshot)

-        # Find the better model and deploy that.
-        if best_error is None or model.mean_squared_error < best_error:
-            best_error = model.mean_squared_error
-            best_model = model
-
-    if deployed and deployed.id == best_model.id:
-        return "rolled back"
-    else:
-        best_model.deploy()
+    if project.deployed_model is None:
+        model.deploy()
         return "deployed"
+    else:
+        return "not deployed"