Source code for maeser.admin_portal.vector_store_operator
# SPDX-License-Identifier: LGPL-3.0-or-later
"""
This module is used to vectorize data from text files.
Output files will be saved in **output_dir** as "index.faiss" and "index.pkl".
This module can be executed in the terminal or used within another script.
"""
import os
import sys
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from maeser.config import OPENAI_API_KEY as key
os.environ["OPENAI_API_KEY"] = key # Modify this line to open it from a cloud based file
# Load all text from the .md files in the target directory. Each file becomes one
# document, with its filename recorded as the "source" metadata so chunks remain
# traceable to the separate files they came from.
def vectorize_data(output_dir: str):
    """Vectorize the Markdown (.md) files in **output_dir** and save the resulting vector store as "index.faiss" and "index.pkl".

    Args:
        output_dir (str): The directory where the Markdown files are located and where the vector store will be saved.
    """
    # Read in all texts
    texts = []
    metadatas = []
    for filename in os.listdir(output_dir):
        if filename.endswith(".md"):
            source_name = os.path.splitext(filename)[0].replace("_", " ")
            with open(os.path.join(output_dir, filename), "r", encoding="utf-8") as f:
                texts.append(f.read())
                metadatas.append({"source": source_name})
    # Split all loaded texts into documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    documents = text_splitter.create_documents(
        texts=texts,
        metadatas=metadatas,
    )
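    # Note: chunk_size and chunk_overlap above are measured in characters (the
    # splitter's default length function is len); the 100-character overlap keeps
    # neighboring chunks contextually linked.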
    # Save the vectorized text to a local FAISS vector store
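    # OpenAIEmbeddings() reads OPENAI_API_KEY from the environment, set above.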
    db = FAISS.from_documents(documents, OpenAIEmbeddings())
    db.save_local(output_dir)
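
# A minimal companion sketch, not part of the original module: it loads the
# store that vectorize_data() saved and runs a similarity search against it.
# The helper name and the k default are illustrative assumptions;
# FAISS.load_local and similarity_search are existing langchain_community APIs.
def search_vector_store(output_dir: str, query: str, k: int = 4):
    """Load the FAISS store saved in **output_dir** and return the k most similar chunks."""
    db = FAISS.load_local(
        output_dir,
        OpenAIEmbeddings(),
        allow_dangerous_deserialization=True,  # index.pkl is pickle-backed
    )
    return db.similarity_search(query, k=k)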

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python vector_store_operator.py <directory>")
        sys.exit(1)
    output_dir = sys.argv[1]
    vectorize_data(output_dir)