# Source code for maeser.admin_portal.vector_store_operator

# SPDX-License-Identifier: LGPL-3.0-or-later

"""
This module is used to vectorize data from text files.
Output files will be saved in **output_dir** as "index.faiss" and "index.pkl".
This module can be executed in the terminal or used within another script.
"""

import os
import sys

from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from maeser.config import OPENAI_API_KEY as key

# Export the key so OpenAIEmbeddings can authenticate via the environment.
os.environ["OPENAI_API_KEY"] = key  # TODO: load the key from a secrets manager / cloud-based file instead of config


# Load all text from the .md files in the given directory and build one unified
# FAISS index per upload. TODO: consider keeping separate indexes per source.
def vectorize_data(output_dir: str) -> None:
    """Vectorize Markdown files in **output_dir** and save the resulting
    vector store as "index.faiss" and "index.pkl".

    Reads every ``.md`` file in ``output_dir``, splits the text into
    overlapping chunks, embeds the chunks with OpenAI embeddings, and
    persists the resulting FAISS index back into ``output_dir``.

    Args:
        output_dir (str): Directory containing the ``.md`` source files;
            also the destination for the saved vector store.
    """
    # Read in all texts, recording a human-readable source name per file.
    texts = []
    metadatas = []
    for filename in os.listdir(output_dir):
        if filename.endswith(".md"):
            # e.g. "study_guide.md" -> source "study guide"
            source_name = os.path.splitext(filename)[0].replace("_", " ")
            with open(os.path.join(output_dir, filename), "r", encoding="utf-8") as f:
                texts.append(f.read())
            metadatas.append({"source": source_name})

    # Split all loaded texts into overlapping ~1000-character documents.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    documents = text_splitter.create_documents(
        texts=texts,
        metadatas=metadatas,
    )

    # Embed and save to a local FAISS vector store (index.faiss / index.pkl).
    # NOTE(review): OpenAIEmbeddings() performs network calls at embed time.
    db = FAISS.from_documents(documents, OpenAIEmbeddings())
    db.save_local(output_dir)
if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: python vector_store_operator.py <directory>") sys.exit(1) output_dir = sys.argv[1] vectorize_data(output_dir)