Skip to content

add the california housing example #16

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Apr 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -211,4 +211,9 @@ Run the test:
psql -f sql/test.sql
```

One liner:
```
cd pgml; sudo python3 setup.py install; cd ../; sudo -u postgres psql -f sql/test.sql
```

Make sure to run it exactly like this, from the root directory of the repo.
41 changes: 41 additions & 0 deletions examples/california_housing/run.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
-- This example trains models on the sklean california_housing dataset
-- which is a copy of the test set from the StatLib repository
-- https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
--
-- This demonstrates using a table with individual columns as features
-- for regression.
SELECT pgml.load_dataset('california_housing');

-- view the dataset
SELECT * from pgml.california_housing;

-- train a simple model to classify the data
SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target');

-- check out the predictions
SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
FROM pgml.california_housing
LIMIT 10;

-- -- train some more models with different algorithms
SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'svm');
SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'random_forest');
SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosting_trees');
-- TODO SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'dense_neural_network');
-- -- check out all that hard work
SELECT * FROM pgml.trained_models;

-- deploy the random_forest model for prediction use
SELECT pgml.deploy('California Housing Prediction', 'random_forest');
-- check out that throughput
SELECT * FROM pgml.deployed_models;

-- do some hyper param tuning
-- TODO SELECT pgml.hypertune(100, 'California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosted_trees');
-- deploy the "best" model for prediction use
SELECT pgml.deploy('California Housing Prediction', 'best_fit');

-- check out the improved predictions
SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
FROM pgml.california_housing
LIMIT 10;
3 changes: 3 additions & 0 deletions examples/digits/run.sql
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
-- which is a copy of the test set of the UCI ML hand-written digits datasets
-- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
--
-- This demonstrates using a table with a single array feature column
-- for classification.
--
-- The final result after a few seconds of training is not terrible. Maybe not perfect
-- enough for mission critical applications, but it's telling how quickly "off the shelf"
-- solutions can solve problems these days.
Expand Down
28 changes: 26 additions & 2 deletions pgml/pgml/datasets.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,45 @@
import plpy
from sklearn.datasets import load_digits as d
import sklearn.datasets

from pgml.sql import q
from pgml.exceptions import PgMLException

def load(source: str):
if source == "digits":
load_digits()
elif source == "california_housing":
load_california_housing()
else:
raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].")
return "OK"

def load_digits():
dataset = d()
dataset = sklearn.datasets.load_digits()
a = plpy.execute("DROP TABLE IF EXISTS pgml.digits")
a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")
a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""")
for X, y in zip(dataset["data"], dataset["target"]):
X = ",".join("%i" % x for x in list(X))
plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""")

def load_california_housing():
dataset = sklearn.datasets.fetch_california_housing()
a = plpy.execute("DROP TABLE IF EXISTS pgml.california_housing")
a = plpy.execute("""
CREATE TABLE pgml.california_housing (
median_income FLOAT4, -- median income in block group
house_age FLOAT4, -- median house age in block group
avg_rooms FLOAT4, -- average number of rooms per household
avg_bedrooms FLOAT4, -- average number of bedrooms per household
population FLOAT4, -- block group population
avg_occupants FLOAT4, -- average number of household members
latitude FLOAT4, -- block group latitude
longitude FLOAT4, -- block group longitudetarget INTEGER
target FLOAT4
)""")
a = plpy.execute(f"""COMMENT ON TABLE pgml.california_housing IS {q(dataset["DESCR"])}""")
for X, y in zip(dataset["data"], dataset["target"]):
plpy.execute(f"""
INSERT INTO pgml.california_housing (median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude, target)
VALUES ({q(X[0])}, {q(X[1])}, {q(X[2])}, {q(X[3])}, {q(X[4])}, {q(X[5])}, {q(X[6])}, {q(X[7])}, {q(y)})""")

2 changes: 1 addition & 1 deletion sql/install.sql
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ $$ LANGUAGE plpython3u;
---
--- Predict
---
CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features NUMERIC[])
CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
AS $$
from pgml.model import Project
Expand Down