support generating embeddings on the fly

Vivian Fang 2023-10-18 19:30:15 -07:00
parent 924be62eea
commit cf927b4e86
5 changed files with 138 additions and 31 deletions

View File

@@ -107,8 +107,10 @@ python main.py --human me.txt
  enables debugging output
--archival_storage_faiss_path=<ARCHIVAL_STORAGE_FAISS_PATH>
  load in document database (backed by FAISS index)
--archival_storage_files="<ARCHIVAL_STORAGE_FILES_GLOB>"
--archival_storage_files="<ARCHIVAL_STORAGE_FILES_GLOB_PATTERN>"
  pre-load files into archival memory
--archival_storage_files_compute_embeddings="<ARCHIVAL_STORAGE_FILES_GLOB_PATTERN>"
  pre-load files into archival memory and also compute embeddings for embedding search
--archival_storage_sqldb=<SQLDB_PATH>
  load in SQL database
```
@@ -181,6 +183,25 @@ To run our example where you can search over the SEC 10-K filings of Uber, Lyft,
```
If you would like to load your own local files into MemGPT's archival memory, run the command above but replace `--archival_storage_files="memgpt/personas/examples/preload_archival/*.txt"` with your own file glob expression (enclosed in quotes).
#### Enhance with embeddings search
In the root `MemGPT` directory, run
```bash
python3 main.py --archival_storage_files_compute_embeddings="<GLOB_PATTERN>" --persona=memgpt_doc --human=basic
```
This will generate embeddings, store them in a FAISS index, write the index to a directory, and then print:
```
To avoid computing embeddings next time, replace --archival_storage_files_compute_embeddings=<GLOB_PATTERN> with
--archival_storage_faiss_path=<DIRECTORY_WITH_EMBEDDINGS> (if your files haven't changed).
```
If you want to reuse these embeddings, run
```bash
python3 main.py --archival_storage_faiss_path="<DIRECTORY_WITH_EMBEDDINGS>" --persona=memgpt_doc --human=basic
```
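For reference, the saved directory can also be queried directly, outside of MemGPT. The snippet below is only a minimal sketch: it assumes the `all_docs.index` / `all_docs.jsonl` layout written by `prepare_archival_index_from_files_compute_embeddings` in this commit, the pre-1.0 `openai` Python API (`openai.Embedding.create`), and a hypothetical output directory name.
```python
import json

import faiss
import numpy as np
import openai  # requires OPENAI_API_KEY in the environment

save_dir = "archival_index_from_files_..."  # hypothetical; use the directory printed by the command above

# load the FAISS index written by faiss.write_index(...)
index = faiss.read_index(f"{save_dir}/all_docs.index")

# all_docs.jsonl holds one JSON list of {'title', 'text'} chunks per file; flattened,
# the order should line up with the order the embeddings were added to the index
chunks = []
with open(f"{save_dir}/all_docs.jsonl") as f:
    for line in f:
        chunks.extend(json.loads(line))

query = "What risks did Uber list in its 10-K?"
resp = openai.Embedding.create(input=[query], model="text-embedding-ada-002")
query_vec = np.array([resp["data"][0]["embedding"]], dtype="float32")

distances, ids = index.search(query_vec, 3)  # 3 nearest chunks by L2 distance
for rank, idx in enumerate(ids[0]):
    print(rank, chunks[idx]["title"], chunks[idx]["text"][:100])
```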
</details>
<details>
<summary><h3>Talking to LlamaIndex API Docs</h3></summary>

View File

@@ -10,6 +10,9 @@ init(autoreset=True)
# DEBUG = True # puts full message outputs in the terminal
DEBUG = False # only dumps important messages in the terminal
def important_message(msg):
    print(f'{Fore.MAGENTA}{Style.BRIGHT}{msg}{Style.RESET_ALL}')
async def internal_monologue(msg):
    # ANSI escape code for italic is '\x1B[3m'
    print(f'\x1B[3m{Fore.LIGHTBLACK_EX}💭 {msg}{Style.RESET_ALL}')

View File

@@ -27,6 +27,7 @@ flags.DEFINE_boolean("first", default=False, required=False, help="Use -first to
flags.DEFINE_boolean("debug", default=False, required=False, help="Use -debug to enable debugging output")
flags.DEFINE_string("archival_storage_faiss_path", default="", required=False, help="Specify archival storage with FAISS index to load (a folder with a .index and .json describing documents to be loaded)")
flags.DEFINE_string("archival_storage_files", default="", required=False, help="Specify files to pre-load into archival memory (glob pattern)")
flags.DEFINE_string("archival_storage_files_compute_embeddings", default="", required=False, help="Specify files to pre-load into archival memory (glob pattern), and compute embeddings over them")
flags.DEFINE_string("archival_storage_sqldb", default="", required=False, help="Specify SQL database to pre-load into archival memory")
@@ -54,6 +55,11 @@ async def main():
        archival_database = utils.prepare_archival_index_from_files(FLAGS.archival_storage_files)
        print(f"Preloaded {len(archival_database)} chunks into archival memory.")
        persistence_manager = InMemoryStateManagerWithPreloadedArchivalMemory(archival_database)
    elif FLAGS.archival_storage_files_compute_embeddings:
        faiss_save_dir = await utils.prepare_archival_index_from_files_compute_embeddings(FLAGS.archival_storage_files_compute_embeddings)
        interface.important_message(f"To avoid computing embeddings next time, replace --archival_storage_files_compute_embeddings={FLAGS.archival_storage_files_compute_embeddings} with\n\t --archival_storage_faiss_path={faiss_save_dir} (if your files haven't changed).")
        index, archival_database = utils.prepare_archival_index(faiss_save_dir)
        persistence_manager = InMemoryStateManagerWithFaiss(index, archival_database)
    else:
        persistence_manager = InMemoryStateManager()
    memgpt_agent = presets.use_preset(presets.DEFAULT, FLAGS.model, personas.get_persona_text(FLAGS.persona), humans.get_human_text(FLAGS.human), interface, persistence_manager)

View File

@@ -9,6 +9,8 @@ import faiss
import tiktoken
import glob
import sqlite3
from tqdm import tqdm
from memgpt.openai_tools import async_get_embedding_with_backoff
def count_tokens(s: str, model: str = "gpt-4") -> int:
    encoding = tiktoken.encoding_for_model(model)
@@ -97,41 +99,54 @@ def read_in_chunks(file_object, chunk_size):
def prepare_archival_index_from_files(glob_pattern, tkns_per_chunk=300, model='gpt-4'):
    encoding = tiktoken.encoding_for_model(model)
    files = glob.glob(glob_pattern)
    return chunk_files(files, tkns_per_chunk, model)
def total_bytes(pattern):
    total = 0
    for filename in glob.glob(pattern):
        if os.path.isfile(filename): # ensure it's a file and not a directory
            total += os.path.getsize(filename)
    return total
def chunk_file(file, tkns_per_chunk=300, model='gpt-4'):
    encoding = tiktoken.encoding_for_model(model)
    with open(file, 'r') as f:
        lines = [l for l in read_in_chunks(f, tkns_per_chunk*4)]
        curr_chunk = []
        curr_token_ct = 0
        for i, line in enumerate(lines):
            line = line.rstrip()
            line = line.lstrip()
            try:
                line_token_ct = len(encoding.encode(line))
            except Exception as e:
                line_token_ct = len(line.split(' ')) / .75
                print(f"Could not encode line {i}, estimating it to be {line_token_ct} tokens")
                print(e)
            if line_token_ct > tkns_per_chunk:
                if len(curr_chunk) > 0:
                    yield ''.join(curr_chunk)
                    curr_chunk = []
                    curr_token_ct = 0
                yield line[:3200]
                continue
            curr_token_ct += line_token_ct
            curr_chunk.append(line)
            if curr_token_ct > tkns_per_chunk:
                yield ''.join(curr_chunk)
                curr_chunk = []
                curr_token_ct = 0
        if len(curr_chunk) > 0:
            yield ''.join(curr_chunk)
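# (Usage note, not part of this commit: chunk_file is a generator, so callers can stream
# ~tkns_per_chunk-token pieces of a file one at a time, e.g. `for chunk in chunk_file(path): ...`;
# chunk_files below simply materializes them into a list.)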
def chunk_files(files, tkns_per_chunk=300, model='gpt-4'):
    archival_database = []
    for file in files:
        timestamp = os.path.getmtime(file)
        formatted_time = datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %I:%M:%S %p %Z%z")
        with open(file, 'r') as f:
            lines = [l for l in read_in_chunks(f, tkns_per_chunk*4)]
            chunks = []
            curr_chunk = []
            curr_token_ct = 0
            for line in lines:
                line = line.rstrip()
                line = line.lstrip()
                try:
                    line_token_ct = len(encoding.encode(line))
                except Exception as e:
                    line_token_ct = len(line.split(' ')) / .75
                    print(f"Could not encode line {line}, estimating it to be {line_token_ct} tokens")
                if line_token_ct > tkns_per_chunk:
                    if len(curr_chunk) > 0:
                        chunks.append(''.join(curr_chunk))
                        curr_chunk = []
                        curr_token_ct = 0
                    chunks.append(line[:3200])
                    continue
                curr_token_ct += line_token_ct
                curr_chunk.append(line)
                if curr_token_ct > tkns_per_chunk:
                    chunks.append(''.join(curr_chunk))
                    curr_chunk = []
                    curr_token_ct = 0
            if len(curr_chunk) > 0:
                chunks.append(''.join(curr_chunk))
        file_stem = file.split('/')[-1]
        chunks = [c for c in chunk_file(file, tkns_per_chunk, model)]
        for i, chunk in enumerate(chunks):
            archival_database.append({
                'content': f"[File: {file_stem} Part {i}/{len(chunks)}] {chunk}",
@@ -139,6 +154,67 @@ def prepare_archival_index_from_files(glob_pattern, tkns_per_chunk=300, model='g
            })
    return archival_database
def chunk_files_for_jsonl(files, tkns_per_chunk=300, model='gpt-4'):
    ret = []
    for file in files:
        file_stem = file.split('/')[-1]
        curr_file = []
        for chunk in chunk_file(file, tkns_per_chunk, model):
            curr_file.append({
                'title': file_stem,
                'text': chunk,
            })
        ret.append(curr_file)
    return ret
async def prepare_archival_index_from_files_compute_embeddings(glob_pattern, tkns_per_chunk=300, model='gpt-4', embeddings_model='text-embedding-ada-002'):
    files = sorted(glob.glob(glob_pattern))
    save_dir = "archival_index_from_files_" + get_local_time().replace(' ', '_').replace(':', '_')
    os.makedirs(save_dir, exist_ok=True)
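    # rough cost estimate: assumes ~3 bytes of input per token, with
    # text-embedding-ada-002 billed at $0.0001 per 1K tokens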
    total_tokens = total_bytes(glob_pattern) / 3
    price_estimate = total_tokens / 1000 * .0001
    confirm = input(f"Computing embeddings over {len(files)} files. This will cost ~${price_estimate:.2f}. Continue? [y/n] ")
    if confirm != 'y':
        raise Exception("embeddings were not computed")
    # chunk the files, make embeddings
    archival_database = chunk_files(files, tkns_per_chunk, model)
    embedding_data = []
    for chunk in tqdm(archival_database, desc="Processing file chunks", total=len(archival_database)):
        # for chunk in tqdm(f, desc=f"Embedding file {i+1}/{len(chunks_by_file)}", total=len(f), leave=False):
        try:
            embedding = await async_get_embedding_with_backoff(chunk['content'], model=embeddings_model)
        except Exception as e:
            print(chunk)
            raise e
        embedding_data.append(embedding)
    embeddings_file = os.path.join(save_dir, "embeddings.json")
    with open(embeddings_file, 'w') as f:
        print(f"Saving embeddings to {embeddings_file}")
        json.dump(embedding_data, f)
    # make all_text.json
    archival_storage_file = os.path.join(save_dir, "all_docs.jsonl")
    chunks_by_file = chunk_files_for_jsonl(files, tkns_per_chunk, model)
    with open(archival_storage_file, 'w') as f:
        print(f"Saving archival storage with preloaded files to {archival_storage_file}")
        for c in chunks_by_file:
            json.dump(c, f)
            f.write('\n')
    # make the faiss index
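    # 1536 is the dimensionality of text-embedding-ada-002 vectors; IndexFlatL2 does exact L2-distance search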
    index = faiss.IndexFlatL2(1536)
    data = np.array(embedding_data).astype('float32')
    try:
        index.add(data)
    except Exception as e:
        print(data)
        raise e
    index_file = os.path.join(save_dir, "all_docs.index")
    print(f"Saving faiss index {index_file}")
    faiss.write_index(index, index_file)
    return save_dir
def read_database_as_list(database_name):
    result_list = []

View File

@@ -11,3 +11,4 @@ pytz
rich
tiktoken
timezonefinder
tqdm