Commit 4877f8b

Add optional vectorizer bytes conversion and exponential backoff (#35)
This PR adds support for:
- Exponential backoff for the OpenAI vectorizer.
- An optional `as_buffer` flag that can be passed to the various vectorizer embedding-creation methods. It defaults to `False`, returning embeddings as lists of floats; passing `True` returns them as byte strings instead.
1 parent 9de14f6 commit 4877f8b
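For context, a minimal sketch of the new `as_buffer` flag in use. This is not part of the commit; the model name is illustrative, and the Hugging Face vectorizer is shown because it runs locally without an API key:

    from redisvl.vectorize.text import HFTextVectorizer

    hf = HFTextVectorizer(model="sentence-transformers/all-mpnet-base-v2")

    # Default: the embedding comes back as a plain list of floats.
    floats = hf.embed("Hello, world!")

    # New: the embedding comes back as a packed byte string, ready to
    # store directly in a Redis hash field.
    buffer = hf.embed("Hello, world!", as_buffer=True)

    assert isinstance(floats, list)
    assert isinstance(buffer, bytes)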

File tree

6 files changed (+169 additions, -41 deletions)


docs/examples/openai_qna.ipynb

Lines changed: 1 addition & 3 deletions
@@ -710,13 +710,11 @@
 "source": [
 "import os\n",
 "from redisvl.vectorize.text import OpenAITextVectorizer\n",
-"from redisvl.utils.utils import array_to_buffer\n",
 "\n",
 "api_key = os.environ.get(\"OPENAI_API_KEY\", \"\")\n",
 "oaip = OpenAITextVectorizer(EMBEDDINGS_MODEL, api_config={\"api_key\": api_key})\n",
 "\n",
-"chunked_data[\"embedding\"] = oaip.embed_many(chunked_data[\"content\"].tolist())\n",
-"chunked_data[\"embedding\"] = chunked_data[\"embedding\"].apply(lambda x: array_to_buffer(x))\n",
+"chunked_data[\"embedding\"] = oaip.embed_many(chunked_data[\"content\"].tolist(), as_buffer=True)\n",
 "chunked_data"
 ]
 },

docs/user_guide/vectorizers_03.ipynb

Lines changed: 2 additions & 2 deletions
@@ -105,7 +105,7 @@
 " \"Today is a sunny day\"\n",
 "]\n",
 "\n",
-"embeddings = hf.embed_many(sentences)\n"
+"embeddings = hf.embed_many(sentences, as_buffer=True)\n"
 ]
 },
 {
@@ -183,7 +183,7 @@
 "# the vector is stored as a bytes buffer\n",
 "\n",
 "data = [{\"text\": t,\n",
-" \"embedding\": array_to_buffer(v)}\n",
+" \"embedding\": v}\n",
 " for t, v in zip(sentences, embeddings)]\n",
 "\n",
 "index.load(data)"

redisvl/vectorize/base.py

Lines changed: 21 additions & 6 deletions
@@ -1,5 +1,7 @@
 from typing import Callable, Dict, List, Optional
 
+from redisvl.utils.utils import array_to_buffer
+
 
 class BaseVectorizer:
     def __init__(self, model: str, dims: int, api_config: Optional[Dict] = None):
@@ -21,27 +23,35 @@ def set_model(self, model: str, dims: Optional[int] = None) -> None:
 
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 1000,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         raise NotImplementedError
 
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         raise NotImplementedError
 
     async def aembed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 1000,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
         raise NotImplementedError
 
     async def aembed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
         raise NotImplementedError
 
@@ -51,3 +61,8 @@ def batchify(self, seq: list, size: int, preprocess: Optional[Callable] = None):
             yield [preprocess(chunk) for chunk in seq[pos : pos + size]]
         else:
             yield seq[pos : pos + size]
+
+    def _process_embedding(self, embedding: List[float], as_buffer: bool):
+        if as_buffer:
+            return array_to_buffer(embedding)
+        return embedding
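The new `_process_embedding` helper delegates to `array_to_buffer` from `redisvl.utils.utils`, which this diff imports but does not show. A minimal sketch of its presumed behavior, assuming float32 packing (the dtype is an assumption, not confirmed by this commit):

    from typing import List

    import numpy as np

    def array_to_buffer(array: List[float]) -> bytes:
        # Pack the floats into a contiguous float32 byte string -- the
        # binary layout Redis vector fields are typically declared with.
        return np.array(array, dtype=np.float32).tobytes()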

redisvl/vectorize/text/huggingface.py

Lines changed: 46 additions & 9 deletions
@@ -4,6 +4,7 @@
 
 
 class HFTextVectorizer(BaseVectorizer):
+    # TODO - add docstring
     def __init__(self, model: str, api_config: Optional[Dict] = None):
         # TODO set dims based on model
         dims = 768
@@ -18,21 +19,57 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
         self._model_client = SentenceTransformer(model)
 
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
+        """Embed a chunk of text using the Hugging Face sentence transformer.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        embedding = self._model_client.encode([emb_input])[0]
-        return embedding.tolist()
+            text = preprocess(text)
+        embedding = self._model_client.encode([text])[0]
+        return self._process_embedding(embedding.tolist(), as_buffer)
 
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: int = 1000,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
-        embeddings = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Embed many chunks of texts using the Hugging Face sentence transformer.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 1000.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             batch_embeddings = self._model_client.encode(batch)
-            embeddings.extend([embedding.tolist() for embedding in batch_embeddings])
+            embeddings.extend(
+                [
+                    self._process_embedding(embedding.tolist(), as_buffer)
+                    for embedding in batch_embeddings
+                ]
+            )
         return embeddings

redisvl/vectorize/text/openai.py

Lines changed: 97 additions & 20 deletions
@@ -1,9 +1,16 @@
 from typing import Callable, Dict, List, Optional
 
+from tenacity import (  # for exponential backoff
+    retry,
+    stop_after_attempt,
+    wait_random_exponential,
+)
+
 from redisvl.vectorize.base import BaseVectorizer
 
 
 class OpenAITextVectorizer(BaseVectorizer):
+    # TODO - add docstring
     def __init__(self, model: str, api_config: Optional[Dict] = None):
         dims = 1536
         super().__init__(model, dims, api_config)
@@ -18,42 +25,112 @@ def __init__(self, model: str, api_config: Optional[Dict] = None):
         openai.api_key = api_config.get("api_key", None)
         self._model_client = openai.Embedding
 
+    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
     def embed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: Optional[int] = 10,
+        as_buffer: Optional[bool] = False,
     ) -> List[List[float]]:
-        results = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Embed many chunks of texts using the OpenAI API.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 10.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             response = self._model_client.create(input=batch, engine=self._model)
-            results += [r["embedding"] for r in response["data"]]
-        return results
+            embeddings += [
+                self._process_embedding(r["embedding"], as_buffer)
+                for r in response["data"]
+            ]
+        return embeddings
 
+    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
     def embed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
+        """Embed a chunk of text using the OpenAI API.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        result = self._model_client.create(input=[emb_input], engine=self._model)
-        return result["data"][0]["embedding"]
+            text = preprocess(text)
+        result = self._model_client.create(input=[text], engine=self._model)
+        return self._process_embedding(result["data"][0]["embedding"], as_buffer)
 
+    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
     async def aembed_many(
         self,
-        inputs: List[str],
+        texts: List[str],
         preprocess: Optional[Callable] = None,
-        chunk_size: int = 1000,
+        batch_size: int = 1000,
+        as_buffer: Optional[bool] = False,
    ) -> List[List[float]]:
-        results = []
-        for batch in self.batchify(inputs, chunk_size, preprocess):
+        """Asynchronously embed many chunks of texts using the OpenAI API.
+
+        Args:
+            texts (List[str]): List of text chunks to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            batch_size (int, optional): Batch size of texts to use when creating
+                embeddings. Defaults to 1000.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[List[float]]: List of embeddings.
+        """
+        embeddings: List = []
+        for batch in self.batchify(texts, batch_size, preprocess):
             response = await self._model_client.acreate(input=batch, engine=self._model)
-            results += [r["embedding"] for r in response["data"]]
-        return results
+            embeddings += [
+                self._process_embedding(r["embedding"], as_buffer)
+                for r in response["data"]
+            ]
+        return embeddings
 
+    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
     async def aembed(
-        self, emb_input: str, preprocess: Optional[Callable] = None
+        self,
+        text: str,
+        preprocess: Optional[Callable] = None,
+        as_buffer: Optional[bool] = False,
     ) -> List[float]:
+        """Asynchronously embed a chunk of text using the OpenAI API.
+
+        Args:
+            text (str): Chunk of text to embed.
+            preprocess (Optional[Callable], optional): Optional preprocessing callable to
+                perform before vectorization. Defaults to None.
+            as_buffer (Optional[bool], optional): Whether to convert the raw embedding
+                to a byte string. Defaults to False.
+
+        Returns:
+            List[float]: Embedding.
+        """
         if preprocess:
-            emb_input = preprocess(emb_input)
-        result = await self._model_client.acreate(input=[emb_input], engine=self._model)
-        return result["data"][0]["embedding"]
+            text = preprocess(text)
+        result = await self._model_client.acreate(input=[text], engine=self._model)
+        return self._process_embedding(result["data"][0]["embedding"], as_buffer)
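The `@retry` decorators above come from tenacity. A standalone sketch of the same backoff pattern applied to a hypothetical flaky function, to illustrate what the decorator parameters do:

    import random

    from tenacity import retry, stop_after_attempt, wait_random_exponential

    @retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def flaky_request() -> str:
        # On each failure, tenacity sleeps a randomized, exponentially
        # growing interval between 1s and 60s, then retries; after 6
        # attempts it gives up and re-raises the last exception.
        if random.random() < 0.5:
            raise ConnectionError("transient failure")
        return "ok"

    print(flaky_request())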

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -2,4 +2,5 @@ numpy
 redis>=4.3.4
 pyyaml
 coloredlogs
-pydantic>=2.0.0
+pydantic>=2.0.0
+tenacity==8.2.2
