Merge pull request #16 from postgresml/montana/housing

levkk · web-flow · commit da81ecf6449c · 2022-04-17T18:25:04.000-07:00
add the california housing example
diff --git a/README.md b/README.md
@@ -211,4 +211,9 @@ Run the test:
 psql -f sql/test.sql
 ```
 
+One liner:
+```
+cd pgml; sudo python3 setup.py install; cd ../; sudo -u postgres psql -f sql/test.sql
+```
+
 Make sure to run it exactly like this, from the root directory of the repo.
diff --git a/examples/california_housing/run.sql b/examples/california_housing/run.sql
@@ -0,0 +1,41 @@
+-- This example trains models on the sklean california_housing dataset
+-- which is a copy of the test set from the StatLib repository
+-- https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html
+--
+-- This demonstrates using a table with individual columns as features
+-- for regression.
+SELECT pgml.load_dataset('california_housing');
+
+-- view the dataset
+SELECT * from pgml.california_housing;
+
+-- train a simple model to classify the data
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target');
+
+-- check out the predictions
+SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
+FROM pgml.california_housing 
+LIMIT 10;
+
+-- -- train some more models with different algorithms
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'svm');
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'random_forest');
+SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosting_trees');
+-- TODO SELECT pgml.train('California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'dense_neural_network');
+-- -- check out all that hard work
+SELECT * FROM pgml.trained_models;
+
+-- deploy the random_forest model for prediction use
+SELECT pgml.deploy('California Housing Prediction', 'random_forest');
+-- check out that throughput
+SELECT * FROM pgml.deployed_models;
+
+-- do some hyper param tuning
+-- TODO SELECT pgml.hypertune(100, 'California Housing Prediction', 'regression', 'pgml.california_housing', 'target', 'gradient_boosted_trees');
+-- deploy the "best" model for prediction use
+SELECT pgml.deploy('California Housing Prediction', 'best_fit');
+
+-- check out the improved predictions
+SELECT target, pgml.predict('California Housing Prediction', ARRAY[median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude]) AS prediction
+FROM pgml.california_housing 
+LIMIT 10;
diff --git a/examples/digits/run.sql b/examples/digits/run.sql
@@ -2,6 +2,9 @@
 -- which is a copy of the test set of the UCI ML hand-written digits datasets
 -- https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
 --
+-- This demonstrates using a table with a single array feature column
+-- for classification.
+--
 -- The final result after a few seconds of training is not terrible. Maybe not perfect
 -- enough for mission critical applications, but it's telling how quickly "off the shelf" 
 -- solutions can solve problems these days.
diff --git a/pgml/pgml/datasets.py b/pgml/pgml/datasets.py
@@ -1,21 +1,45 @@
 import plpy
-from sklearn.datasets import load_digits as d
+import sklearn.datasets
 
 from pgml.sql import q
 from pgml.exceptions import PgMLException
 
 def load(source: str):
     if source == "digits":
         load_digits()
+    elif source == "california_housing":
+        load_california_housing()
     else:
         raise PgMLException(f"Invalid dataset name: {source}. Valid values are ['digits'].")
     return "OK"
 
 def load_digits():
-    dataset = d()
+    dataset = sklearn.datasets.load_digits()
     a = plpy.execute("DROP TABLE IF EXISTS pgml.digits")
     a = plpy.execute("CREATE TABLE pgml.digits (image SMALLINT[], target INTEGER)")
     a = plpy.execute(f"""COMMENT ON TABLE pgml.digits IS {q(dataset["DESCR"])}""")
     for X, y in zip(dataset["data"], dataset["target"]):
         X = ",".join("%i" % x for x in list(X))
         plpy.execute(f"""INSERT INTO pgml.digits (image, target) VALUES ('{{{X}}}', {y})""")
+
+def load_california_housing():
+    dataset = sklearn.datasets.fetch_california_housing()
+    a = plpy.execute("DROP TABLE IF EXISTS pgml.california_housing")
+    a = plpy.execute("""
+        CREATE TABLE pgml.california_housing (
+            median_income FLOAT4, -- median income in block group
+            house_age FLOAT4, -- median house age in block group
+            avg_rooms FLOAT4, -- average number of rooms per household
+            avg_bedrooms FLOAT4, -- average number of bedrooms per household
+            population FLOAT4, -- block group population
+            avg_occupants FLOAT4, -- average number of household members
+            latitude FLOAT4, -- block group latitude
+            longitude FLOAT4, -- block group longitudetarget INTEGER
+            target FLOAT4
+        )""")
+    a = plpy.execute(f"""COMMENT ON TABLE pgml.california_housing IS {q(dataset["DESCR"])}""")
+    for X, y in zip(dataset["data"], dataset["target"]):
+        plpy.execute(f"""
+            INSERT INTO pgml.california_housing (median_income, house_age, avg_rooms, avg_bedrooms, population, avg_occupants, latitude, longitude, target) 
+            VALUES ({q(X[0])}, {q(X[1])}, {q(X[2])}, {q(X[3])}, {q(X[4])}, {q(X[5])}, {q(X[6])}, {q(X[7])}, {q(y)})""")
+    
diff --git a/sql/install.sql b/sql/install.sql
@@ -136,7 +136,7 @@ $$ LANGUAGE plpython3u;
 ---
 --- Predict
 ---
-CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features NUMERIC[])
+CREATE OR REPLACE FUNCTION pgml.predict(project_name TEXT, features DOUBLE PRECISION[])
 RETURNS DOUBLE PRECISION
 AS $$
 	from pgml.model import Project