-
Notifications
You must be signed in to change notification settings - Fork 325
add a new example #14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
-- This example trains models on the sklean digits dataset | ||
-- which is a copy of the test set of the UCI ML hand-written digits datasets | ||
-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits | ||
-- | ||
-- The final result after a few seconds of training is not terrible. Maybe not perfect | ||
-- enough for mission critical applications, but it's telling how quickly "off the shelf" | ||
-- solutions can solve problems these days. | ||
SELECT pgml.load_dataset('digits'); | ||
|
||
-- view the dataset | ||
SELECT * from pgml.digits; | ||
|
||
-- train a simple model to classify the data | ||
SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target'); | ||
|
||
-- check out the predictions | ||
SELECT target, pgml.predict('Handwritten Digit Image Classifier', image) AS prediction | ||
FROM pgml.digits | ||
LIMIT 10; | ||
|
||
-- -- train some more models with different algorithms | ||
SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'svm'); | ||
SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'random_forest'); | ||
SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'gradient_boosting_trees'); | ||
-- TODO SELECT pgml.train('Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'dense_neural_network'); | ||
-- -- check out all that hard work | ||
SELECT * FROM pgml.trained_models; | ||
|
||
-- deploy the random_forest model for prediction use | ||
SELECT pgml.deploy('Handwritten Digit Image Classifier', 'random_forest'); | ||
-- check out that throughput | ||
SELECT * FROM pgml.deployed_models; | ||
|
||
-- do some hyper param tuning | ||
-- TODO SELECT pgml.hypertune(100, 'Handwritten Digit Image Classifier', 'classification', 'pgml.digits', 'target', 'gradient_boosted_trees'); | ||
-- deploy the "best" model for prediction use | ||
SELECT pgml.deploy('Handwritten Digit Image Classifier', 'best_fit'); | ||
|
||
-- check out the improved predictions | ||
SELECT target, pgml.predict('Handwritten Digit Image Classifier', image) AS prediction | ||
FROM pgml.digits | ||
LIMIT 10; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
def version(): | ||
return "0.4.1" | ||
return "0.4.2" |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import plpy | ||
from sklearn.datasets import load_digits as d | ||
|
||
from pgml.sql import q | ||
from pgml.exceptions import PgMLException | ||
|
||
def load(source: str): | ||
if source == "digits": | ||
load_digits() | ||
else: | ||
raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].") | ||
return "OK" | ||
|
||
def load_digits(): | ||
dataset = d() | ||
a = plpy.execute("DROP TABLE IF EXISTS pgml.digits") | ||
a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)") | ||
a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""") | ||
for X, y in zip(dataset["data"], dataset["target"]): | ||
X = ",".join("%i" % x for x in list(X)) | ||
plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,23 @@ | ||
from re import M | ||
import plpy | ||
from sklearn.linear_model import LinearRegression | ||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier | ||
from sklearn.linear_model import LinearRegression, LogisticRegression | ||
from sklearn.svm import SVR, SVC | ||
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.metrics import mean_squared_error, r2_score | ||
from sklearn.metrics import mean_squared_error, r2_score, f1_score, precision_score, recall_score | ||
|
||
import pickle | ||
import json | ||
|
||
from pgml.exceptions import PgMLException | ||
from pgml.sql import q | ||
|
||
def flatten(S): | ||
if S == []: | ||
return S | ||
if isinstance(S[0], list): | ||
return flatten(S[0]) + flatten(S[1:]) | ||
return S[:1] + flatten(S[1:]) | ||
|
||
class Project(object): | ||
""" | ||
|
@@ -124,6 +133,14 @@ def deployed_model(self): | |
self._deployed_model = Model.find_deployed(self.id) | ||
return self._deployed_model | ||
|
||
def deploy(self, algorithm_name): | ||
model = None | ||
if algorithm_name == "best_fit": | ||
model = Model.find_by_project_and_best_fit(self) | ||
else: | ||
model = Model.find_by_project_id_and_algorithm_name(self.id, algorithm_name) | ||
model.deploy() | ||
return model | ||
|
||
class Snapshot(object): | ||
""" | ||
|
@@ -178,7 +195,7 @@ def create( | |
plpy.execute( | ||
f""" | ||
CREATE TABLE pgml."snapshot_{snapshot.id}" AS | ||
SELECT * FROM "{snapshot.relation_name}"; | ||
SELECT * FROM {snapshot.relation_name}; | ||
""" | ||
) | ||
snapshot.__dict__ = dict( | ||
|
@@ -232,6 +249,7 @@ def data(self): | |
for column in columns: | ||
x_.append(row[column]) | ||
|
||
x_ = flatten(x_) # TODO be smart about flattening X depending on algorithm | ||
X.append(x_) | ||
y.append(y_) | ||
|
||
|
@@ -262,8 +280,7 @@ class Model(object): | |
status (str): The current status of the model, e.g. 'new', 'training' or 'successful' | ||
created_at (Timestamp): when this model was created | ||
updated_at (Timestamp): when this model was last updated | ||
mean_squared_error (float): | ||
r2_score (float): | ||
metrics (dict): key performance indicators for the model | ||
pickle (bytes): the serialized version of the model parameters | ||
algorithm: the in memory version of the model parameters that can make predictions | ||
""" | ||
|
@@ -320,6 +337,63 @@ def find_deployed(cls, project_id: int): | |
model.__init__() | ||
return model | ||
|
||
@classmethod | ||
def find_by_project_id_and_algorithm_name(cls, project_id: int, algorithm_name: str): | ||
""" | ||
Args: | ||
project_id (int): The project id | ||
algorithm_name (str): The algorithm | ||
Returns: | ||
Model: most recently created model that fits the criteria | ||
""" | ||
result = plpy.execute( | ||
f""" | ||
SELECT models.* | ||
FROM pgml.models | ||
WHERE algorithm_name = {q(algorithm_name)} | ||
AND project_id = {q(project_id)} | ||
ORDER by models.created_at DESC | ||
LIMIT 1 | ||
""" | ||
) | ||
if len(result) == 0: | ||
return None | ||
|
||
model = Model() | ||
model.__dict__ = dict(result[0]) | ||
model.__init__() | ||
return model | ||
|
||
@classmethod | ||
def find_by_project_and_best_fit(cls, project: Project): | ||
""" | ||
Args: | ||
project (Project): The project | ||
Returns: | ||
Model: the model with the best metrics for the project | ||
""" | ||
if project.objective == "regression": | ||
metric = "mean_squared_error" | ||
elif project.objective == "classification": | ||
metric = "f1" | ||
|
||
result = plpy.execute( | ||
f""" | ||
SELECT models.* | ||
FROM pgml.models | ||
WHERE project_id = {q(project.id)} | ||
ORDER by models.metrics->>{q(metric)} DESC | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Relevant metrics are different depending on the objective. We could have another join table to hold just metrics per model, but that seems like overkill just yet. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was just thinking of making them nullable and only fill in the relevant columns for the model being trained. |
||
LIMIT 1 | ||
""" | ||
) | ||
if len(result) == 0: | ||
return None | ||
|
||
model = Model() | ||
model.__dict__ = dict(result[0]) | ||
model.__init__() | ||
return model | ||
|
||
def __init__(self): | ||
self._algorithm = None | ||
self._project = None | ||
|
@@ -342,8 +416,13 @@ def algorithm(self): | |
else: | ||
self._algorithm = { | ||
"linear_regression": LinearRegression, | ||
"linear_classification": LogisticRegression, | ||
"svm_regression": SVR, | ||
"svm_classification": SVC, | ||
"random_forest_regression": RandomForestRegressor, | ||
"random_forest_classification": RandomForestClassifier, | ||
"gradient_boosting_trees_regression": GradientBoostingRegressor, | ||
"gradient_boosting_trees_classification": GradientBoostingClassifier, | ||
}[self.algorithm_name + "_" + self.project.objective]() | ||
|
||
return self._algorithm | ||
|
@@ -362,8 +441,14 @@ def fit(self, snapshot: Snapshot): | |
|
||
# Test | ||
y_pred = self.algorithm.predict(X_test) | ||
msq = mean_squared_error(y_test, y_pred) | ||
r2 = r2_score(y_test, y_pred) | ||
metrics = {} | ||
if self.project.objective == "regression": | ||
metrics["mean_squared_error"] = mean_squared_error(y_test, y_pred) | ||
metrics["r2"] = r2_score(y_test, y_pred) | ||
elif self.project.objective == "classification": | ||
metrics["f1"] = f1_score(y_test, y_pred, average="weighted") | ||
metrics["precision"] = precision_score(y_test, y_pred, average="weighted") | ||
metrics["recall"] = recall_score(y_test, y_pred, average="weighted") | ||
|
||
# Save the model | ||
self.__dict__ = dict( | ||
|
@@ -372,8 +457,7 @@ def fit(self, snapshot: Snapshot): | |
UPDATE pgml.models | ||
SET pickle = '\\x{pickle.dumps(self.algorithm).hex()}', | ||
status = 'successful', | ||
mean_squared_error = {q(msq)}, | ||
r2_score = {q(r2)} | ||
metrics = {q(json.dumps(metrics))} | ||
WHERE id = {q(self.id)} | ||
RETURNING * | ||
""" | ||
|
@@ -398,6 +482,7 @@ def predict(self, data: list): | |
Returns: | ||
float or int: scores for regressions or ints for classifications | ||
""" | ||
# TODO: add metrics for tracking prediction volume/accuracy by model | ||
return self.algorithm.predict(data) | ||
|
||
|
||
|
@@ -406,6 +491,7 @@ def train( | |
objective: str, | ||
relation_name: str, | ||
y_column_name: str, | ||
algorithm_name: str = "linear", | ||
test_size: float or int = 0.1, | ||
test_sampling: str = "random", | ||
): | ||
|
@@ -416,15 +502,14 @@ def train( | |
objective (str): Defaults to "regression". Valid values are ["regression", "classification"]. | ||
relation_name (str): the table or view that stores the training data | ||
y_column_name (str): the column in the training data that acts as the label | ||
algorithm (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "random_forest"]. | ||
algorithm_name (str, optional): the algorithm used to implement the objective. Defaults to "linear". Valid values are ["linear", "svm", "random_forest", "gradient_boosting"]. | ||
test_size (float or int, optional): If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split. If int, represents the absolute number of test samples. If None, the value is set to the complement of the train size. If train_size is also None, it will be set to 0.25. | ||
test_sampling: (str, optional): How to sample to create the test data. Defaults to "random". Valid values are ["first", "last", "random"]. | ||
""" | ||
if objective == "regression": | ||
algorithms = ["linear", "random_forest"] | ||
elif objective == "classification": | ||
algorithms = ["random_forest"] | ||
else: | ||
if algorithm_name is None: | ||
algorithm_name = "linear" | ||
|
||
if objective not in ["regression", "classification"]: | ||
raise PgMLException( | ||
f"Unknown objective `{objective}`, available options are: regression, classification." | ||
) | ||
|
@@ -440,23 +525,11 @@ def train( | |
) | ||
|
||
snapshot = Snapshot.create(relation_name, y_column_name, test_size, test_sampling) | ||
deployed = Model.find_deployed(project.id) | ||
|
||
# Let's assume that the deployed model is better for now. | ||
best_model = deployed | ||
best_error = best_model.mean_squared_error if best_model else None | ||
|
||
for algorithm_name in algorithms: | ||
model = Model.create(project, snapshot, algorithm_name) | ||
model.fit(snapshot) | ||
model = Model.create(project, snapshot, algorithm_name) | ||
model.fit(snapshot) | ||
|
||
# Find the better model and deploy that. | ||
if best_error is None or model.mean_squared_error < best_error: | ||
best_error = model.mean_squared_error | ||
best_model = model | ||
|
||
if deployed and deployed.id == best_model.id: | ||
return "rolled back" | ||
else: | ||
best_model.deploy() | ||
if project.deployed_model is None: | ||
model.deploy() | ||
return "deployed" | ||
else: | ||
return "not deployed" |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Are you sure this won't blow the stack on a large dataset?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It’s called per row, and I haven’t seen datasets with more than 4D arrays.