diff --git a/.github/workflows/pgml-rds-proxy.yaml b/.github/workflows/pgml-rds-proxy.yaml new file mode 100644 index 000000000..cfffc4482 --- /dev/null +++ b/.github/workflows/pgml-rds-proxy.yaml @@ -0,0 +1,24 @@ +name: Build and release pgml-rds-proxy Docker image + +on: + workflow_dispatch: +jobs: + publish-proxy-docker-image: + strategy: + matrix: + os: ["buildjet-4vcpu-ubuntu-2204"] + runs-on: ${{ matrix.os }} + defaults: + run: + working-directory: packages/pgml-rds-proxy + steps: + - uses: actions/checkout@v2 + - name: Login to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build and push Docker image + run: | + bash build-docker-image.sh diff --git a/packages/pgml-rds-proxy/Dockerfile b/packages/pgml-rds-proxy/Dockerfile new file mode 100644 index 000000000..dadef5791 --- /dev/null +++ b/packages/pgml-rds-proxy/Dockerfile @@ -0,0 +1,10 @@ +FROM ubuntu:22.04 +ENV PGCAT_VERSION=2.0.0-alpha18 +RUN apt update && \ + apt install -y curl postgresql-client-common postgresql-client-14 && \ + apt clean +WORKDIR /pgml-rds-proxy +COPY --chown=root:root download-pgcat.sh download-pgcat.sh +COPY --chown=root:root run.sh run.sh +RUN bash download-pgcat.sh +ENTRYPOINT ["bash", "run.sh"] diff --git a/packages/pgml-rds-proxy/README.md b/packages/pgml-rds-proxy/README.md new file mode 100644 index 000000000..6cf35569b --- /dev/null +++ b/packages/pgml-rds-proxy/README.md @@ -0,0 +1,79 @@ +# pgml-rds-proxy + +A pgcat-based PostgreSQL proxy that allows to use PostgresML functions on managed PostgreSQL databases that may not have Internet access, like AWS RDS. + +## Getting started + +A Docker image is provided and is the easiest way to get started. To run the image, you can simply: + +```bash +docker run \ + -e DATABASE_URL=postgres://pg:ml@sql.cloud.postgresml.org:38042/pgml \ + -p 6432:6432 \ + ghcr.io/postgresml/pgml-rds-proxy:latest +``` + +**Note:** Replace the `DATABASE_URL` above with the `DATABASE_URL` of your own PostgresML database. + +If you're running this on EC2, make sure the instance is placed inside the same VPC as your RDS database and that the RDS database is allowed to make outbound connections to the EC2 instance. +The example above starts the proxy process on port 6432, so for your security group configuration, make sure the database can make outbound connections to the EC2 instance using TCP on port 6432. + +### Configure FDW + +We'll be using the Foreign Data Wrapper extension to connect from your RDS database to PostgresML, forwarding the connection through the proxy. If you're running the proxy on EC2, take note of the private IP +or DNS entry of the instance. + +Before proceeding, make sure you have the following extensions installed into your RDS database: + +```postgresql +CREATE EXTENSION IF NOT EXISTS dblink; +CREATE EXTENSION IF NOT EXISTS postgres_fdw; +``` + +Both of these require superuser, so make sure you're running these commands with a user that has the `rds_superuser` role. + +To create a foreign data wrapper connection, take your PostgresML host and port and replace the host with the private IP or DNS entry of the instance. + +```postgresql +CREATE SERVER postgresml +FOREIGN DATA WRAPPER postgres_fdw +OPTIONS ( + host '127.0.0.1', + port '6432', + dbname 'pgml' +); +``` + +Replace the value for `host` with the private IP or DNS entry of the EC2 instance running the proxy. Replace the `dbname` with the name of the database from your PostgresML database `DATABASE_URL`. + +#### User mapping + +PostgresML and the proxy requires authentication. For each user that will use the connection, create a user mapping, like so: + +```postgresql +CREATE USER MAPPING +FOR CURRENT_USER +SERVER postgresml +OPTIONS ( + user 'pg', + password 'ml' +); +``` + +Replace the values for `user` and `password` with the values from your PostgresML database `DATABASE_URL`. This example contains values that will only work with our demo server and aren't suitable for production. `CURRENT_USER` is a special PostgreSQL variable that's replaced by the name of the user running the command. If you want to create this mapping for other users, replace it with the name of the user/role. + +### Test the connection + +To test the connection, you can use `dblink`: + +``` +SELECT + * +FROM + dblink( + 'postgresml', + 'SELECT * FROM pgml.embed(''intfloat/e5-small'', ''embed this text'') AS embedding' +) AS t1(embedding real[386]); +``` + +If everything is configured correctly, you should see an array of 386 floating points, your first embedding generated using PostgresML on AWS RDS. Both dblink and the proxy makes efficient use of connections, so queries will be executed as fast as the network connection allows. diff --git a/packages/pgml-rds-proxy/build-docker-image.sh b/packages/pgml-rds-proxy/build-docker-image.sh new file mode 100644 index 000000000..ff78af0f4 --- /dev/null +++ b/packages/pgml-rds-proxy/build-docker-image.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# +# +# +set -ex + +docker run --privileged --rm tonistiigi/binfmt --install all +docker buildx create --use --name mybuilder || true +docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --tag ghcr.io/postgresml/pgml-rds-proxy:latest \ + --progress plain \ + --no-cache \ + --push \ + . diff --git a/packages/pgml-rds-proxy/download-pgcat.sh b/packages/pgml-rds-proxy/download-pgcat.sh new file mode 100644 index 000000000..26cb609e7 --- /dev/null +++ b/packages/pgml-rds-proxy/download-pgcat.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# Download the right version of pgcat for the architecture. +# +# Author: PostgresML +# License: MIT +# +architecture=$(arch) +name=$(uname) +url="https://static.postgresml.org/packages/pgcat" +version="$PGCAT_VERSION" +bin_name="pgcat2-$version.bin" + +if [[ -z "$version" ]]; then + echo "PGCAT_VERSION environment variable is not set" + exit 1 +fi + +if [[ "$architecture" == "aarch64" && "$name" == "Linux" ]]; then + url="${url}/arm64/$bin_name" +elif [[ "$architecture" == "x86_64" && "$name" == "Linux" ]]; then + url="${url}/amd64/$bin_name" +else + echo "Unsupported platform: ${name} ${architecture}" + exit 1 +fi + +echo "Downloading pgcat from $url" +curl -L -o /usr/local/bin/pgcat ${url} +chmod +x /usr/local/bin/pgcat diff --git a/packages/pgml-rds-proxy/run.sh b/packages/pgml-rds-proxy/run.sh new file mode 100644 index 000000000..0df30c75e --- /dev/null +++ b/packages/pgml-rds-proxy/run.sh @@ -0,0 +1,11 @@ +#!/bin/bash +# +# Configure pgcat from a DATABASE_URL environment variable and run it as PID 1. +# This will regenerate the configuration file every time so modifications to it won't be saved. +# +# If you want to modify the configuration file, generate it first and then run pgcat with `--config ` instead. +# +# Author: PostgresML +# License: MIT +# +exec /usr/local/bin/pgcat --database-url ${DATABASE_URL} diff --git a/pgml-sdks/pgml/Cargo.lock b/pgml-sdks/pgml/Cargo.lock index 11128b907..c9d5723db 100644 --- a/pgml-sdks/pgml/Cargo.lock +++ b/pgml-sdks/pgml/Cargo.lock @@ -1531,7 +1531,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "pgml" -version = "1.0.0" +version = "1.0.2" dependencies = [ "anyhow", "async-trait", @@ -1562,6 +1562,7 @@ dependencies = [ "tokio", "tracing", "tracing-subscriber", + "url", "uuid", "walkdir", ] diff --git a/pgml-sdks/pgml/Cargo.toml b/pgml-sdks/pgml/Cargo.toml index e78e7413a..ba4037bce 100644 --- a/pgml-sdks/pgml/Cargo.toml +++ b/pgml-sdks/pgml/Cargo.toml @@ -45,6 +45,7 @@ ctrlc = "3" inquire = "0.6" parking_lot = "0.12.1" once_cell = "1.19.0" +url = "2.5.0" [features] default = [] diff --git a/pgml-sdks/pgml/src/cli.rs b/pgml-sdks/pgml/src/cli.rs index 709e5c1ab..c77c9d5b0 100644 --- a/pgml-sdks/pgml/src/cli.rs +++ b/pgml-sdks/pgml/src/cli.rs @@ -10,7 +10,7 @@ use pyo3::prelude::*; use sqlx::{Acquire, Executor}; use std::io::Write; -/// PostgresML CLI +/// PostgresML CLI: configure your PostgresML deployments & create connections to remote data sources. #[cfg(feature = "python")] #[derive(Parser, Debug, Clone)] #[command(author, version, about, long_about = None, name = "pgml", bin_name = "pgml")] @@ -97,6 +97,13 @@ enum Subcommands { #[arg(long)] database_url: Option, }, + + /// Connect your database to PostgresML via dblink. + Remote { + /// DATABASE_URL. + #[arg(long, short)] + database_url: Option, + }, } enum Level { @@ -212,6 +219,10 @@ async fn cli_internal() -> anyhow::Result<()> { ) .await?; } + + Subcommands::Remote { database_url } => { + remote(database_url).await?; + } }; Ok(()) @@ -326,6 +337,49 @@ async fn connect( Ok(()) } +async fn remote(database_url: Option) -> anyhow::Result<()> { + let database_url = user_input!(database_url, "PostgresML DATABASE_URL"); + let database_url = url::Url::parse(&database_url)?; + let user = database_url.username(); + if user.is_empty() { + anyhow::bail!("user not found in DATABASE_URL"); + } + + let password = database_url.password(); + let password = if password.is_none() { + anyhow::bail!("password not found in DATABASE_URL"); + } else { + password.unwrap() + }; + + let host = database_url.host_str(); + let host = if host.is_none() { + anyhow::bail!("host not found in DATABASE_URL"); + } else { + host.unwrap() + }; + + let port = database_url.port(); + let port = if port.is_none() { + "6432".to_string() + } else { + port.unwrap().to_string() + }; + + let database = database_url.path().replace("/", ""); + + let sql = include_str!("sql/remote.sql") + .replace("{user}", user) + .replace("{password}", password) + .replace("{host}", host) + .replace("{db_name}", "postgresml") + .replace("{database_name}", &database) + .replace("{port}", &port); + + println!("{}", syntax_highlight(&sql)); + Ok(()) +} + fn syntax_highlight(text: &str) -> String { if !std::io::stdout().is_terminal() { return text.to_owned(); diff --git a/pgml-sdks/pgml/src/sql/remote.sql b/pgml-sdks/pgml/src/sql/remote.sql new file mode 100644 index 000000000..d44b7b84f --- /dev/null +++ b/pgml-sdks/pgml/src/sql/remote.sql @@ -0,0 +1,31 @@ + + CREATE EXTENSION IF NOT EXISTS postgres_fdw; + CREATE EXTENSION IF NOT EXISTS dblink; + + CREATE SERVER "{db_name}" + FOREIGN DATA WRAPPER postgres_fdw + OPTIONS ( + host '{host}', + port '{port}', + dbname '{database_name}' + ); + + CREATE USER MAPPING + FOR CURRENT_USER + SERVER "{db_name}" + OPTIONS ( + user '{user}', + password '{password}' + ); + + SELECT * FROM dblink( + '{db_name}', + 'SELECT pgml.embed(''intfloat/e5-small'', ''test postgresml embedding'') AS embedding' + ) AS t(embedding real[386]); + + CREATE FUNCTION pgml_embed_e5_small(text) RETURNS real[386] AS $$ + SELECT * FROM dblink( + '{db_name}', + 'SELECT pgml.embed(''intfloat/e5-small'', ''' || $1 || ''') AS embedding' + ) AS t(embedding real[386]); + $$ LANGUAGE SQL;