Spaces:
Runtime error
Runtime error
LOUIS SANNA
committed on
Commit
·
d98ba57
1
Parent(s):
cc2ce8c
feat(data): add other pdfs
Browse files- chroma_db/13934663-2db5-404d-be0f-51734d442e08/data_level0.bin +3 -0
- chroma_db/13934663-2db5-404d-be0f-51734d442e08/header.bin +3 -0
- chroma_db/13934663-2db5-404d-be0f-51734d442e08/length.bin +3 -0
- chroma_db/13934663-2db5-404d-be0f-51734d442e08/link_lists.bin +0 -0
- chroma_db/chroma.sqlite3 +2 -2
- climateqa/build_index.py +37 -15
- climateqa/qa_logging.py +2 -0
- climateqa/vectorstore.py +3 -1
chroma_db/13934663-2db5-404d-be0f-51734d442e08/data_level0.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a13e72541800c513c73dccea69f79e39cf4baef4fa23f7e117c0d6b0f5f99670
|
| 3 |
+
size 3212000
|
chroma_db/13934663-2db5-404d-be0f-51734d442e08/header.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0ec6df10978b056a10062ed99efeef2702fa4a1301fad702b53dd2517103c746
|
| 3 |
+
size 100
|
chroma_db/13934663-2db5-404d-be0f-51734d442e08/length.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
|
| 3 |
+
size 4000
|
chroma_db/13934663-2db5-404d-be0f-51734d442e08/link_lists.bin
ADDED
|
File without changes
|
chroma_db/chroma.sqlite3
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4dc2c64a9de7507097ab452fdce23fc6348f38e0d34484d791a8c43366b78001
|
| 3 |
+
size 2564096
|
climateqa/build_index.py
CHANGED
|
@@ -1,11 +1,12 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
from langchain.vectorstores import Chroma
|
| 4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 5 |
from langchain.document_loaders import PyPDFLoader
|
| 6 |
|
| 7 |
from .embeddings import EMBEDDING_MODEL_NAME
|
| 8 |
-
from .vectorstore import get_vectorstore
|
| 9 |
|
| 10 |
|
| 11 |
def load_data():
|
|
@@ -15,24 +16,33 @@ def load_data():
|
|
| 15 |
|
| 16 |
assert isinstance(vectorstore, Chroma)
|
| 17 |
vectorstore.from_documents(
|
| 18 |
-
docs, embedding_function, persist_directory=
|
| 19 |
)
|
| 20 |
return vectorstore
|
| 21 |
|
| 22 |
|
| 23 |
def parse_data():
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
# split it into chunks
|
| 28 |
-
text_splitter = RecursiveCharacterTextSplitter(chunk_size=250, chunk_overlap=0)
|
| 29 |
-
docs = text_splitter.split_documents(pages)
|
| 30 |
-
print(docs)
|
| 31 |
-
for doc in docs:
|
| 32 |
-
doc.metadata["name"] = parse_name(doc.metadata["source"])
|
| 33 |
-
doc.metadata["domain"] = parse_domain(doc.metadata["source"])
|
| 34 |
-
doc.metadata["page_number"] = doc.metadata["page"]
|
| 35 |
-
doc.metadata["short_name"] = doc.metadata["name"]
|
| 36 |
return docs
|
| 37 |
|
| 38 |
|
|
@@ -41,10 +51,22 @@ def parse_name(source: str) -> str:
|
|
| 41 |
|
| 42 |
|
| 43 |
def parse_domain(source: str) -> str:
|
| 44 |
-
return source.split("/")[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
|
| 47 |
if __name__ == "__main__":
|
|
|
|
| 48 |
db = load_data()
|
| 49 |
# query it
|
| 50 |
query = (
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
from langchain.vectorstores import Chroma
|
| 5 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 6 |
from langchain.document_loaders import PyPDFLoader
|
| 7 |
|
| 8 |
from .embeddings import EMBEDDING_MODEL_NAME
|
| 9 |
+
from .vectorstore import PERSIST_DIRECTORY, get_vectorstore
|
| 10 |
|
| 11 |
|
| 12 |
def load_data():
|
|
|
|
| 16 |
|
| 17 |
assert isinstance(vectorstore, Chroma)
|
| 18 |
vectorstore.from_documents(
|
| 19 |
+
docs, embedding_function, persist_directory=PERSIST_DIRECTORY
|
| 20 |
)
|
| 21 |
return vectorstore
|
| 22 |
|
| 23 |
|
| 24 |
def parse_data():
|
| 25 |
+
docs = []
|
| 26 |
+
for root, dirs, files in os.walk("data"):
|
| 27 |
+
for file in files:
|
| 28 |
+
if file.endswith(".pdf"):
|
| 29 |
+
file_path = os.path.join(root, file)
|
| 30 |
+
loader = PyPDFLoader(file_path)
|
| 31 |
+
pages = loader.load_and_split()
|
| 32 |
+
|
| 33 |
+
# split it into chunks
|
| 34 |
+
text_splitter = RecursiveCharacterTextSplitter(
|
| 35 |
+
chunk_size=1000, chunk_overlap=0
|
| 36 |
+
)
|
| 37 |
+
doc_chunks = text_splitter.split_documents(pages)
|
| 38 |
+
|
| 39 |
+
for chunk in doc_chunks:
|
| 40 |
+
chunk.metadata["name"] = parse_name(chunk.metadata["source"])
|
| 41 |
+
chunk.metadata["domain"] = parse_domain(chunk.metadata["source"])
|
| 42 |
+
chunk.metadata["page_number"] = chunk.metadata["page"]
|
| 43 |
+
chunk.metadata["short_name"] = chunk.metadata["name"]
|
| 44 |
+
docs.append(chunk)
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
return docs
|
| 47 |
|
| 48 |
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
def parse_domain(source: str) -> str:
|
| 54 |
+
return source.split("/")[1]
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def clear_index():
|
| 58 |
+
folder = PERSIST_DIRECTORY
|
| 59 |
+
for filename in os.listdir(folder):
|
| 60 |
+
file_path = os.path.join(folder, filename)
|
| 61 |
+
try:
|
| 62 |
+
if os.path.isfile(file_path) or os.path.islink(file_path):
|
| 63 |
+
os.unlink(file_path)
|
| 64 |
+
except Exception as e:
|
| 65 |
+
print("Failed to delete %s. Reason: %s" % (file_path, e))
|
| 66 |
|
| 67 |
|
| 68 |
if __name__ == "__main__":
|
| 69 |
+
clear_index()
|
| 70 |
db = load_data()
|
| 71 |
# query it
|
| 72 |
query = (
|
climateqa/qa_logging.py
CHANGED
|
@@ -2,6 +2,7 @@ import datetime
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
|
|
|
|
| 5 |
def log(question, history, docs, user_id):
|
| 6 |
if has_blob_config():
|
| 7 |
log_in_azure(question, history, docs, user_id)
|
|
@@ -48,6 +49,7 @@ def get_azure_blob_client():
|
|
| 48 |
file_share_name = "climategpt"
|
| 49 |
# I don't know why this is necessary, but it causes an error otherwise when running build_index.py
|
| 50 |
from azure.storage.fileshare import ShareServiceClient
|
|
|
|
| 51 |
service = ShareServiceClient(account_url=account_url, credential=credential)
|
| 52 |
share_client = service.get_share_client(file_share_name)
|
| 53 |
return share_client
|
|
|
|
| 2 |
import json
|
| 3 |
import os
|
| 4 |
|
| 5 |
+
|
| 6 |
def log(question, history, docs, user_id):
|
| 7 |
if has_blob_config():
|
| 8 |
log_in_azure(question, history, docs, user_id)
|
|
|
|
| 49 |
file_share_name = "climategpt"
|
| 50 |
# I don't know why this is necessary, but it causes an error otherwise when running build_index.py
|
| 51 |
from azure.storage.fileshare import ShareServiceClient
|
| 52 |
+
|
| 53 |
service = ShareServiceClient(account_url=account_url, credential=credential)
|
| 54 |
share_client = service.get_share_client(file_share_name)
|
| 55 |
return share_client
|
climateqa/vectorstore.py
CHANGED
|
@@ -5,6 +5,8 @@ import os
|
|
| 5 |
import pinecone
|
| 6 |
from langchain.vectorstores import Chroma, Pinecone
|
| 7 |
|
|
|
|
|
|
|
| 8 |
try:
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
|
|
@@ -21,7 +23,7 @@ def get_vectorstore(embeddings_function):
|
|
| 21 |
|
| 22 |
def get_chroma_vectore_store(embedding_function):
|
| 23 |
return Chroma(
|
| 24 |
-
persist_directory=
|
| 25 |
)
|
| 26 |
|
| 27 |
|
|
|
|
| 5 |
import pinecone
|
| 6 |
from langchain.vectorstores import Chroma, Pinecone
|
| 7 |
|
| 8 |
+
PERSIST_DIRECTORY = "./chroma_db"
|
| 9 |
+
|
| 10 |
try:
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
|
|
|
|
| 23 |
|
| 24 |
def get_chroma_vectore_store(embedding_function):
|
| 25 |
return Chroma(
|
| 26 |
+
persist_directory=PERSIST_DIRECTORY, embedding_function=embedding_function
|
| 27 |
)
|
| 28 |
|
| 29 |
|