Commit cac1a6a

pgml Python SDK with vector search support (#636)
1 parent 04f7e26 commit cac1a6a

File tree

11 files changed, +4009 -0 lines changed


pgml-sdks/python/pgml/README.md

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# PostgresML Python SDK

This Python SDK provides an easy interface to use PostgresML generative AI capabilities.

## Table of Contents

- [Quickstart](#quickstart)

### Quickstart

1. Install Python 3.11. The SDK should work with Python >= 3.8; however, at this time we have only tested it with Python 3.11.
2. Clone the repository and check out the SDK branch (until this PR is merged):
```
git clone https://github.com/postgresml/postgresml
cd postgresml
git checkout santi-pgml-memory-sdk-python
cd pgml-sdks/python/pgml
```
3. Install Poetry: `pip install poetry`
4. Initialize the Python environment:
```
poetry env use python3.11
poetry shell
poetry install
poetry build
```
5. The SDK uses your local PostgresML database by default:
`postgres://postgres@127.0.0.1:5433/pgml_development`

If your local database is not up to date with `pgml.embed`, please [sign up for a free database](https://postgresml.org/signup) and set the `PGML_CONNECTION` environment variable to your serverless hosted database:

```
export PGML_CONNECTION="postgres://<username>:<password>@<hostname>:<port>/<database>"
```
6. Run the **vector search** example (a condensed sketch of the flow follows this list):

```
python examples/vector_search.py
```
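
For reference, here is a condensed sketch of what `examples/vector_search.py` does. All calls (`Database`, `create_or_get_collection`, `upsert_documents`, `generate_chunks`, `generate_embeddings`, `vector_search`, `archive_collection`) come from this SDK; the collection name and documents below are placeholders, while the bundled script loads the SQuAD dataset instead.

```
import os
from pgml import Database

# Use PGML_CONNECTION if set, otherwise fall back to the local development database.
local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development"
db = Database(os.environ.get("PGML_CONNECTION", local_pgml))

collection = db.create_or_get_collection("quickstart_demo")  # placeholder collection name

# Placeholder documents; examples/vector_search.py loads the SQuAD dataset instead.
documents = [
    {"id": "doc-1", "text": "PostgresML brings machine learning to Postgres.", "title": "pgml"},
    {"id": "doc-2", "text": "Vector search retrieves semantically similar text.", "title": "search"},
]

collection.upsert_documents(documents)
collection.generate_chunks()      # split documents into chunks
collection.generate_embeddings()  # embed chunks with the default model

results = collection.vector_search("What is PostgresML?", top_k=2)
print(results)

db.archive_collection("quickstart_demo")
```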
Lines changed: 236 additions & 0 deletions
@@ -0,0 +1,236 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pgml import Database\n",
    "import os\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "local_pgml = \"postgres://postgres@127.0.0.1:5433/pgml_development\"\n",
    "\n",
    "conninfo = os.environ.get(\"PGML_CONNECTION\",local_pgml)\n",
    "db = Database(conninfo,min_connections=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection_name = \"test_pgml_sdk_1\"\n",
    "collection = db.create_or_get_collection(collection_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "data = load_dataset(\"squad\", split=\"train\")\n",
    "data = data.to_pandas()\n",
    "data.head()\n",
    "\n",
    "data = data.drop_duplicates(subset=[\"context\"])\n",
    "print(len(data))\n",
    "data.head()\n",
    "\n",
    "documents = [\n",
    "    {\n",
    "        'text': r['context'],\n",
    "        'metadata': {\n",
    "            'title': r['title']\n",
    "        }\n",
    "    } for r in data.to_dict(orient='records')\n",
    "]\n",
    "documents[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.upsert_documents(documents[0:200])\n",
    "collection.generate_chunks()\n",
    "collection.generate_embeddings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2)\n",
    "print(json.dumps(results,indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.register_model(model_name=\"paraphrase-MiniLM-L6-v2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.get_models()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(json.dumps(collection.get_models(),indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.generate_embeddings(model_id=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=2)\n",
    "print(json.dumps(results,indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.register_model(model_name=\"hkunlp/instructor-xl\", model_params={\"instruction\": \"Represent the Wikipedia document for retrieval: \"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.get_models()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.generate_embeddings(model_id=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, model_id=3, query_parameters={\"instruction\": \"Represent the Wikipedia question for retrieving supporting documents: \"})\n",
    "print(json.dumps(results,indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.register_text_splitter(splitter_name=\"RecursiveCharacterTextSplitter\",splitter_params={\"chunk_size\": 100,\"chunk_overlap\": 20})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.generate_chunks(splitter_id=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "collection.generate_embeddings(splitter_id=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = collection.vector_search(\"Who won 20 Grammy awards?\", top_k=2, splitter_id=2)\n",
    "print(json.dumps(results,indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "db.delete_collection(collection_name)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pgml-zoggicR5-py3.11",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
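
Because the raw `.ipynb` JSON above is hard to read, here is the same custom model and splitter flow as a plain Python sketch. Each call mirrors a notebook cell; the hard-coded `model_id` and `splitter_id` values (2 and 3) are assumptions that depend on registration order, and the collection is assumed to already hold the upserted, chunked, and embedded documents from the earlier cells.

```
import os
from pgml import Database

db = Database(os.environ.get("PGML_CONNECTION", "postgres://postgres@127.0.0.1:5433/pgml_development"))
collection = db.create_or_get_collection("test_pgml_sdk_1")  # assumes documents were already indexed

# Register a second embedding model and re-embed the collection with it.
collection.register_model(model_name="paraphrase-MiniLM-L6-v2")
collection.generate_embeddings(model_id=2)  # model_id=2 assumes this was the second model registered
results = collection.vector_search("Who won 20 Grammy awards?", top_k=2, model_id=2)

# Instructor models take one instruction for documents and another for queries.
collection.register_model(
    model_name="hkunlp/instructor-xl",
    model_params={"instruction": "Represent the Wikipedia document for retrieval: "},
)
collection.generate_embeddings(model_id=3)
results = collection.vector_search(
    "Who won 20 Grammy awards?",
    top_k=2,
    model_id=3,
    query_parameters={"instruction": "Represent the Wikipedia question for retrieving supporting documents: "},
)

# Register a custom text splitter, then re-chunk, re-embed, and search against it.
collection.register_text_splitter(
    splitter_name="RecursiveCharacterTextSplitter",
    splitter_params={"chunk_size": 100, "chunk_overlap": 20},
)
collection.generate_chunks(splitter_id=2)
collection.generate_embeddings(splitter_id=2)
results = collection.vector_search("Who won 20 Grammy awards?", top_k=2, splitter_id=2)
```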
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
from pgml import Database
import os
import json
from datasets import load_dataset
from time import time
from rich import print as rprint

# Use PGML_CONNECTION if set, otherwise fall back to the local development database.
local_pgml = "postgres://postgres@127.0.0.1:5433/pgml_development"
conninfo = os.environ.get("PGML_CONNECTION", local_pgml)
db = Database(conninfo)

collection_name = "test_pgml_sdk_1"
collection = db.create_or_get_collection(collection_name)

# Load the SQuAD training split and keep one row per unique context.
data = load_dataset("squad", split="train")
data = data.to_pandas()
data = data.drop_duplicates(subset=["context"])

documents = [
    {"id": r["id"], "text": r["context"], "title": r["title"]}
    for r in data.to_dict(orient="records")
]

# Index the first 200 documents: upsert, chunk, and embed.
collection.upsert_documents(documents[:200])
collection.generate_chunks()
collection.generate_embeddings()

start = time()
results = collection.vector_search("Who won 20 grammy awards?", top_k=2)
rprint(json.dumps(results, indent=2))
rprint("Query time %0.3f" % (time() - start))
db.archive_collection(collection_name)
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
from .database import Database
from .collection import Collection
from .dbutils import (
    run_create_or_insert_statement,
    run_select_statement,
    run_drop_or_delete_statement,
)
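
Given these re-exports, application code can import the main entry points directly from the package. A minimal usage sketch, assuming the local default connection string from the README and that `create_or_get_collection` returns a `Collection`:

```
from pgml import Database, Collection

# Local development connection string (default from the README).
db = Database("postgres://postgres@127.0.0.1:5433/pgml_development")

# Assumption: create_or_get_collection hands back a Collection instance.
collection = db.create_or_get_collection("my_collection")
```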
