Refactor the initialization of GUC parameters.

higuoxing · higuoxing · commit 4612b4f655a7 · 2024-03-08T07:59:49.000+08:00
GUC parameters that are defined in several different places are hard to
maintain. This patch gathers all GUC definitions in a single place. It
also defines them through the define_xxx_guc() APIs, which will let us
manage GucContext and GucFlags in the future.

P.S. The test case test_trusted_model does not seem to be correct; it is
fixed in this patch as well.
diff --git a/pgml-extension/Cargo.lock b/pgml-extension/Cargo.lock
diff --git a/pgml-extension/Cargo.toml b/pgml-extension/Cargo.toml
@@ -49,6 +49,7 @@ serde = { version = "1.0" }
 serde_json = { version = "1.0", features = ["preserve_order"] }
 typetag = "0.2"
 xgboost = { git = "https://github.com/postgresml/rust-xgboost", branch = "master" }
+lazy_static = "1.4.0"
 
 [dev-dependencies]
 pgrx-tests = "=0.11.2"
diff --git a/pgml-extension/src/bindings/python/mod.rs b/pgml-extension/src/bindings/python/mod.rs
@@ -6,11 +6,9 @@ use pgrx::*;
 use pyo3::prelude::*;
 use pyo3::types::PyTuple;
 
-use crate::config::get_config;
+use crate::config::PGML_VENV;
 use crate::create_pymodule;
 
-static CONFIG_NAME: &str = "pgml.venv";
-
 create_pymodule!("/src/bindings/python/python.py");
 
 pub fn activate_venv(venv: &str) -> Result<bool> {
@@ -23,8 +21,8 @@ pub fn activate_venv(venv: &str) -> Result<bool> {
 }
 
 pub fn activate() -> Result<bool> {
-    match get_config(CONFIG_NAME) {
-        Some(venv) => activate_venv(&venv),
+    match PGML_VENV.1.get() {
+        Some(venv) => activate_venv(&venv.to_string_lossy()),
         None => Ok(false),
     }
 }
diff --git a/pgml-extension/src/bindings/transformers/whitelist.rs b/pgml-extension/src/bindings/transformers/whitelist.rs
@@ -1,47 +1,54 @@
 use anyhow::{bail, Error};
+use pgrx::GucSetting;
 #[cfg(any(test, feature = "pg_test"))]
 use pgrx::{pg_schema, pg_test};
 use serde_json::Value;
+use std::ffi::CStr;
 
-use crate::config::get_config;
-
-static CONFIG_HF_WHITELIST: &str = "pgml.huggingface_whitelist";
-static CONFIG_HF_TRUST_REMOTE_CODE_BOOL: &str = "pgml.huggingface_trust_remote_code";
-static CONFIG_HF_TRUST_WHITELIST: &str = "pgml.huggingface_trust_remote_code_whitelist";
+use crate::config::{PGML_HF_TRUST_REMOTE_CODE, PGML_HF_TRUST_WHITELIST, PGML_HF_WHITELIST};
 
 /// Verify that the model in the task JSON is allowed based on the huggingface whitelists.
 pub fn verify_task(task: &Value) -> Result<(), Error> {
     let task_model = match get_model_name(task) {
         Some(model) => model.to_string(),
         None => return Ok(()),
     };
-    let whitelisted_models = config_csv_list(CONFIG_HF_WHITELIST);
+    let whitelisted_models = config_csv_list(&PGML_HF_WHITELIST.1);
 
     let model_is_allowed = whitelisted_models.is_empty() || whitelisted_models.contains(&task_model);
     if !model_is_allowed {
-        bail!("model {task_model} is not whitelisted. Consider adding to {CONFIG_HF_WHITELIST} in postgresql.conf");
+        bail!(
+            "model {} is not whitelisted. Consider adding to {} in postgresql.conf",
+            task_model,
+            PGML_HF_WHITELIST.0
+        );
     }
 
     let task_trust = get_trust_remote_code(task);
-    let trust_remote_code = get_config(CONFIG_HF_TRUST_REMOTE_CODE_BOOL)
-        .map(|v| v == "true")
-        .unwrap_or(true);
+    let trust_remote_code = PGML_HF_TRUST_REMOTE_CODE.1.get();
 
-    let trusted_models = config_csv_list(CONFIG_HF_TRUST_WHITELIST);
+    let trusted_models = config_csv_list(&PGML_HF_TRUST_WHITELIST.1);
 
     let model_is_trusted = trusted_models.is_empty() || trusted_models.contains(&task_model);
 
     let remote_code_allowed = trust_remote_code && model_is_trusted;
     if !remote_code_allowed && task_trust == Some(true) {
-        bail!("model {task_model} is not trusted to run remote code. Consider setting {CONFIG_HF_TRUST_REMOTE_CODE_BOOL} = 'true' or adding {task_model} to {CONFIG_HF_TRUST_WHITELIST}");
+        bail!(
+            "model {} is not trusted to run remote code. Consider setting {} = 'true' or adding {} to {}",
+            task_model,
+            PGML_HF_TRUST_REMOTE_CODE.0,
+            task_model,
+            PGML_HF_TRUST_WHITELIST.0
+        );
     }
 
     Ok(())
 }
 
-fn config_csv_list(name: &str) -> Vec<String> {
-    match get_config(name) {
+fn config_csv_list(csv_list: &GucSetting<Option<&'static CStr>>) -> Vec<String> {
+    match csv_list.get() {
         Some(value) => value
+            .to_string_lossy()
             .trim_matches('"')
             .split(',')
             .filter_map(|s| if s.is_empty() { None } else { Some(s.to_string()) })
@@ -122,7 +129,7 @@ mod tests {
     #[pg_test]
     fn test_empty_whitelist() {
         let model = "Salesforce/xgen-7b-8k-inst";
-        set_config(CONFIG_HF_WHITELIST, "").unwrap();
+        set_config(PGML_HF_WHITELIST.0, "").unwrap();
         let task_json = format!(json_template!(), model, false);
         let task: Value = serde_json::from_str(&task_json).unwrap();
         assert!(verify_task(&task).is_ok());
@@ -131,12 +138,12 @@ mod tests {
     #[pg_test]
     fn test_nonempty_whitelist() {
         let model = "Salesforce/xgen-7b-8k-inst";
-        set_config(CONFIG_HF_WHITELIST, model).unwrap();
+        set_config(PGML_HF_WHITELIST.0, model).unwrap();
         let task_json = format!(json_template!(), model, false);
         let task: Value = serde_json::from_str(&task_json).unwrap();
         assert!(verify_task(&task).is_ok());
 
-        set_config(CONFIG_HF_WHITELIST, "other_model").unwrap();
+        set_config(PGML_HF_WHITELIST.0, "other_model").unwrap();
         let task_json = format!(json_template!(), model, false);
         let task: Value = serde_json::from_str(&task_json).unwrap();
         assert!(verify_task(&task).is_err());
@@ -145,18 +152,18 @@ mod tests {
     #[pg_test]
     fn test_trusted_model() {
         let model = "Salesforce/xgen-7b-8k-inst";
-        set_config(CONFIG_HF_WHITELIST, model).unwrap();
-        set_config(CONFIG_HF_TRUST_WHITELIST, model).unwrap();
+        set_config(PGML_HF_WHITELIST.0, model).unwrap();
+        set_config(PGML_HF_TRUST_WHITELIST.0, model).unwrap();
 
         let task_json = format!(json_template!(), model, false);
         let task: Value = serde_json::from_str(&task_json).unwrap();
         assert!(verify_task(&task).is_ok());
 
         let task_json = format!(json_template!(), model, true);
         let task: Value = serde_json::from_str(&task_json).unwrap();
-        assert!(verify_task(&task).is_ok());
+        assert!(verify_task(&task).is_err());
 
-        set_config(CONFIG_HF_TRUST_REMOTE_CODE_BOOL, "true").unwrap();
+        set_config(PGML_HF_TRUST_REMOTE_CODE.0, "true").unwrap();
         let task_json = format!(json_template!(), model, false);
         let task: Value = serde_json::from_str(&task_json).unwrap();
         assert!(verify_task(&task).is_ok());
@@ -169,8 +176,8 @@ mod tests {
     #[pg_test]
     fn test_untrusted_model() {
         let model = "Salesforce/xgen-7b-8k-inst";
-        set_config(CONFIG_HF_WHITELIST, model).unwrap();
-        set_config(CONFIG_HF_TRUST_WHITELIST, "other_model").unwrap();
+        set_config(PGML_HF_WHITELIST.0, model).unwrap();
+        set_config(PGML_HF_TRUST_WHITELIST.0, "other_model").unwrap();
 
         let task_json = format!(json_template!(), model, false);
         let task: Value = serde_json::from_str(&task_json).unwrap();
@@ -180,7 +187,7 @@ mod tests {
         let task: Value = serde_json::from_str(&task_json).unwrap();
         assert!(verify_task(&task).is_err());
 
-        set_config(CONFIG_HF_TRUST_REMOTE_CODE_BOOL, "true").unwrap();
+        set_config(PGML_HF_TRUST_REMOTE_CODE.0, "true").unwrap();
         let task_json = format!(json_template!(), model, false);
         let task: Value = serde_json::from_str(&task_json).unwrap();
         assert!(verify_task(&task).is_ok());
diff --git a/pgml-extension/src/config.rs b/pgml-extension/src/config.rs
@@ -1,16 +1,58 @@
+use lazy_static::lazy_static;
+use pgrx::{GucContext, GucFlags, GucRegistry, GucSetting};
 use std::ffi::CStr;
 
 #[cfg(any(test, feature = "pg_test"))]
 use pgrx::{pg_schema, pg_test};
-use pgrx_pg_sys::AsPgCStr;
 
-pub fn get_config(name: &str) -> Option<String> {
-    // SAFETY: name is not null because it is a Rust reference.
-    let ptr = unsafe { pgrx_pg_sys::GetConfigOption(name.as_pg_cstr(), true, false) };
-    (!ptr.is_null()).then(move || {
-        // SAFETY: assuming pgrx_pg_sys is providing a valid, null terminated pointer.
-        unsafe { CStr::from_ptr(ptr) }.to_string_lossy().to_string()
-    })
+lazy_static! {
+    pub static ref PGML_VENV: (&'static str, GucSetting<Option<&'static CStr>>) =
+        ("pgml.venv", GucSetting::<Option<&'static CStr>>::new(None));
+    pub static ref PGML_HF_WHITELIST: (&'static str, GucSetting<Option<&'static CStr>>) = (
+        "pgml.huggingface_whitelist",
+        GucSetting::<Option<&'static CStr>>::new(None),
+    );
+    pub static ref PGML_HF_TRUST_REMOTE_CODE: (&'static str, GucSetting<bool>) =
+        ("pgml.huggingface_trust_remote_code", GucSetting::<bool>::new(false));
+    pub static ref PGML_HF_TRUST_WHITELIST: (&'static str, GucSetting<Option<&'static CStr>>) = (
+        "pgml.huggingface_trust_remote_code_whitelist",
+        GucSetting::<Option<&'static CStr>>::new(None),
+    );
+}
+
+pub fn initialize_server_params() {
+    GucRegistry::define_string_guc(
+        PGML_VENV.0,
+        "Python's virtual environment path",
+        "",
+        &PGML_VENV.1,
+        GucContext::Userset,
+        GucFlags::default(),
+    );
+    GucRegistry::define_string_guc(
+        PGML_HF_WHITELIST.0,
+        "Models allowed to be downloaded from huggingface",
+        "",
+        &PGML_HF_WHITELIST.1,
+        GucContext::Userset,
+        GucFlags::default(),
+    );
+    GucRegistry::define_bool_guc(
+        PGML_HF_TRUST_REMOTE_CODE.0,
+        "Whether model can execute remote codes",
+        "",
+        &PGML_HF_TRUST_REMOTE_CODE.1,
+        GucContext::Userset,
+        GucFlags::default(),
+    );
+    GucRegistry::define_string_guc(
+        PGML_HF_TRUST_WHITELIST.0,
+        "Models allowed to execute remote codes when pgml.huggingface_trust_remote_code = 'on'",
+        "",
+        &PGML_HF_TRUST_WHITELIST.1,
+        GucContext::Userset,
+        GucFlags::default(),
+    );
 }
 
 #[cfg(any(test, feature = "pg_test"))]
@@ -26,17 +68,11 @@ pub fn set_config(name: &str, value: &str) -> Result<(), pgrx::spi::Error> {
 mod tests {
     use super::*;
 
-    #[pg_test]
-    fn read_config_max_connections() {
-        let name = "max_connections";
-        assert_eq!(get_config(name), Some("100".into()));
-    }
-
     #[pg_test]
     fn read_pgml_huggingface_whitelist() {
         let name = "pgml.huggingface_whitelist";
         let value = "meta-llama/Llama-2-7b";
         set_config(name, value).unwrap();
-        assert_eq!(get_config(name), Some(value.into()));
+        assert_eq!(PGML_HF_WHITELIST.1.get().unwrap().to_string_lossy(), value);
     }
 }
diff --git a/pgml-extension/src/lib.rs b/pgml-extension/src/lib.rs
@@ -24,6 +24,7 @@ extension_sql_file!("../sql/schema.sql", name = "schema");
 #[cfg(not(feature = "use_as_lib"))]
 #[pg_guard]
 pub extern "C" fn _PG_init() {
+    config::initialize_server_params();
     bindings::python::activate().expect("Error setting python venv");
     orm::project::init();
 }

Original file line number	Diff line number	Diff line change
`@@ -6,11 +6,9 @@ use pgrx::*;`
`6`	`6`	`use pyo3::prelude::*;`
`7`	`7`	`use pyo3::types::PyTuple;`
`8`	`8`
`9`		`-use crate::config::get_config;`
	`9`	`+use crate::config::PGML_VENV;`
`10`	`10`	`use crate::create_pymodule;`
`11`	`11`
`12`		`-static CONFIG_NAME: &str = "pgml.venv";`
`13`		`-`
`14`	`12`	`create_pymodule!("/src/bindings/python/python.py");`
`15`	`13`
`16`	`14`	`pub fn activate_venv(venv: &str) -> Result<bool> {`
`@@ -23,8 +21,8 @@ pub fn activate_venv(venv: &str) -> Result<bool> {`
`23`	`21`	`}`
`24`	`22`
`25`	`23`	`pub fn activate() -> Result<bool> {`
`26`		`- match get_config(CONFIG_NAME) {`
`27`		`- Some(venv) => activate_venv(&venv),`
	`24`	`+ match PGML_VENV.1.get() {`
	`25`	`+ Some(venv) => activate_venv(&venv.to_string_lossy()),`
`28`	`26`	`None => Ok(false),`
`29`	`27`	`}`
`30`	`28`	`}`
Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,7 @@ extension_sql_file!("../sql/schema.sql", name = "schema");`
`24`	`24`	`#[cfg(not(feature = "use_as_lib"))]`
`25`	`25`	`#[pg_guard]`
`26`	`26`	`pub extern "C" fn _PG_init() {`
	`27`	`+ config::initialize_server_params();`
`27`	`28`	`bindings::python::activate().expect("Error setting python venv");`
`28`	`29`	`orm::project::init();`
`29`	`30`	`}`