Spaces:
Runtime error
Runtime error
| import os | |
| import shutil | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from langchain.document_loaders import PyPDFLoader | |
| from .config import get_sources | |
| from .embeddings import EMBEDDING_MODEL_NAME | |
| from .vectorstore import PERSIST_DIRECTORY, get_vectorstore | |
# Minimum number of characters a chunk must contain to be indexed;
# parse_data() skips any chunk whose page_content is shorter than this.
MIN_CHUNK_SIZE = 100
def load_data():
    """Parse every configured source, embed the chunks and build the index.

    Progress is reported on stdout. The index is written under
    ``PERSIST_DIRECTORY``.

    Returns:
        Chroma: the vector store that was populated with the parsed chunks,
        ready to be queried (e.g. with ``similarity_search``).
    """
    print("Loading data...")
    docs = parse_data()
    print("Documents loaded")

    embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

    print("Building index...")
    # NOTE(review): this call is kept so that whatever setup get_vectorstore()
    # performs still happens; confirm whether it is still needed.
    vectorstore = get_vectorstore(embedding_function)
    assert isinstance(vectorstore, Chroma)

    # from_documents() is a classmethod: it builds and returns a *new* store
    # instead of filling the instance it is called on. The result must be
    # kept and returned, otherwise the caller gets a store that was never
    # given the documents.
    populated_store = Chroma.from_documents(
        docs, embedding_function, persist_directory=PERSIST_DIRECTORY
    )
    print("Index built")
    return populated_store
def parse_data():
    """Load all configured PDF sources and return their annotated chunks.

    Every source yielded by ``get_sources()`` is read with ``PyPDFLoader`` and
    re-split into chunks of at most 1000 characters with no overlap. Chunks
    whose text is shorter than ``MIN_CHUNK_SIZE`` characters are dropped.
    Each kept chunk has these metadata keys set:

    * ``name`` and ``domain`` -- copied from the source entry
    * ``url`` -- copied from the source entry, only when it is present and
      truthy
    * ``page_number`` -- copy of the chunk's existing ``page`` metadata
      (presumably set by ``PyPDFLoader``; verify against the loader)
    * ``short_name`` -- same value as ``name``

    Returns:
        list: the kept chunk documents, in source order.

    Raises:
        KeyError: if a source entry lacks ``file_path``, ``name`` or
            ``domain``, or a chunk has no ``page`` metadata.
    """
    # The splitter configuration is the same for every source, so build it
    # once instead of once per source.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

    docs = []
    for source in get_sources():
        pages = PyPDFLoader(source["file_path"]).load_and_split()
        # The URL is optional and per-source: look it up once, not per chunk.
        url = source.get("url")

        for chunk in text_splitter.split_documents(pages):
            if len(chunk.page_content) < MIN_CHUNK_SIZE:
                continue
            chunk.metadata["name"] = source["name"]
            chunk.metadata["domain"] = source["domain"]
            if url:
                chunk.metadata["url"] = url
            chunk.metadata["page_number"] = chunk.metadata["page"]
            chunk.metadata["short_name"] = chunk.metadata["name"]
            docs.append(chunk)
    return docs
def clear_index(directory_path=None):
    """Delete everything inside the index directory, keeping the directory.

    Deletion is best-effort: a failure on one entry is reported on stdout and
    the remaining entries are still processed.

    Args:
        directory_path: Directory to empty. Defaults to ``PERSIST_DIRECTORY``
            when omitted or ``None``.

    Returns:
        None.
    """
    if directory_path is None:
        directory_path = PERSIST_DIRECTORY

    # On a first run the directory may not exist yet; os.listdir() would raise
    # FileNotFoundError, and there is nothing to clear anyway.
    if not os.path.isdir(directory_path):
        print(f"Nothing to clear: {directory_path} is not an existing directory")
        return

    for filename in os.listdir(directory_path):
        file_path = os.path.join(directory_path, filename)
        print(f"Deleting {file_path}")
        try:
            # Symlinks are unlinked rather than followed, so a link to a
            # directory never causes the link target to be removed.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except OSError as e:
            print(f"Failed to delete {file_path}. Reason: {e}")
if __name__ == "__main__":
    # Rebuild the index from scratch, then run one sample query against it
    # and print the matching documents as a quick smoke test.
    clear_index()
    store = load_data()
    sample_query = "He who can bear the misfortune of a nation is called the ruler of the world."
    print(store.similarity_search(sample_query))