Update embeddings.md to use cosine distance in pgvector example (#716)

amir-qasemi · web-flow · commit e889f72bf59a · 2023-06-09T09:04:25.000-07:00
diff --git a/pgml-dashboard/static/docs/guides/transformers/embeddings.md b/pgml-dashboard/static/docs/guides/transformers/embeddings.md
@@ -63,28 +63,18 @@ ORDER BY similarity DESC
 LIMIT 50;
 ```
 
-```
-WITH query AS (
-    SELECT pgml.embed('sentence-transformers/all-MiniLM-L6-v2', 'Star Wars christmas special is on Disney') AS embedding
-)
-SELECT text, pgml.cosine_similarity(tweet_embeddings_2.embedding, query.embedding) AS similarity
-FROM tweet_embeddings_2, query
-ORDER BY similarity DESC
-LIMIT 50;
-```
 On small datasets (<100k rows), a linear search that compares every row to the query will give sub-second results, which may be fast enough for your use case. For larger datasets, you may want to consider various indexing strategies offered by additional extensions.
 
 - [Cube](https://www.postgresql.org/docs/current/cube.html) is a built-in extension that provides a fast indexing strategy for finding similar vectors. By default it has an arbitrary limit of 100 dimensions, unless Postgres is compiled with a larger size.
 - [PgVector](https://github.com/pgvector/pgvector) supports embeddings up to 2000 dimensions out of the box, and provides a fast indexing strategy for finding similar vectors.
 
 ```
 CREATE EXTENSION vector;
-CREATE TABLE items (text text, embedding vector(384));
-insert into items select text, embedding from tweet_embeddings_2;
+CREATE TABLE items (text TEXT, embedding VECTOR(768));
+INSERT INTO items SELECT text, embedding FROM tweet_embeddings;
+CREATE INDEX ON items USING ivfflat (embedding vector_cosine_ops);
 WITH query AS (
-    SELECT pgml.embed('sentence-transformers/all-MiniLM-L6-v2', 'Star Wars christmas special is on Disney')::vector AS embedding
+    SELECT pgml.embed('distilbert-base-uncased', 'Star Wars christmas special is on Disney')::vector AS embedding
 )
-SELECT * FROM items, query ORDER BY items.embedding <-> query.embedding LIMIT 10;
-
-CREATE INDEX ON tweet_embeddings_2 USING ivfflat (embedding vector_cosine_ops);
+SELECT * FROM items, query ORDER BY items.embedding <=> query.embedding LIMIT 10;
 ```