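"""Streamlit app that compares two insurance policy PDFs.

Uploaded PDFs are chunked, embedded with sentence-transformers, indexed in
FAISS, and queried through Groq's gemma2-9b-it model to answer comparison
questions.
"""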
import streamlit as st
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
import faiss
import numpy as np
import csv
from groq import Groq
import tempfile
import os
import re
import textwrap
# Read the Groq API key from the environment; falls back to a placeholder
# you can replace directly.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "YOUR_GROQ_API_KEY")

FAISS_INDEX_PATH = "faiss_index.index"
METADATA_PATH = "metadata.csv"
def process_pdfs(pdf_files):
    """Processes uploaded PDF files and returns their processed text."""
    processed_texts = []
    for pdf_file in pdf_files:
        try:
            # Write the upload to a temporary file so PyPDFLoader can read it from disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
                temp_pdf.write(pdf_file.read())
                temp_pdf_path = temp_pdf.name
            loader = PyPDFLoader(temp_pdf_path)
            documents = loader.load()
            text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=300, separator="\n\n")
            texts = text_splitter.split_documents(documents)
            page_contents = [doc.page_content for doc in texts]
            combined_text = "\n\n".join(page_contents)
            # Run the cleaning helper defined below before storing the text.
            processed_texts.append(clean_text(combined_text))
            os.remove(temp_pdf_path)
        except Exception as e:
            st.error(f"Error processing {pdf_file.name}: {str(e)}")
            processed_texts.append("")
    return processed_texts
def clean_text(text):
    """Enhanced text cleaning function."""
    if not text:
        return ""
    text = " ".join(text.split())              # collapse runs of whitespace
    # Normalize bullets and en dashes before stripping non-ASCII, otherwise
    # the ASCII filter deletes them first and these rules never match.
    text = re.sub(r'\s*•\s*', '. ', text)      # turn bullet points into sentence breaks
    text = re.sub(r'\s*–\s*', '-', text)       # normalize en dashes
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # drop remaining non-ASCII characters
    text = re.sub(r'[\n\r\t]', ' ', text)      # flatten any leftover line breaks/tabs
    return text.strip()
def summarize_text(text):
    """Summarizes text using Groq's gemma2-9b-it model."""
    if not text:
        return ""
    client = Groq(api_key=GROQ_API_KEY)
    # Rough character budget per request; longer inputs are chunked below.
    max_input_length = 4096

    def get_summary(chunk):
        prompt = f"""[INST] Summarize the following text while preserving key details and clarity.
Text: {chunk}
Summary: [/INST]"""
        chat_completion = client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model="gemma2-9b-it"
        )
        return chat_completion.choices[0].message.content.strip()

    # textwrap.wrap measures width in characters, so compare character counts here
    # as well (the original compared a word count against a character width).
    if len(text) > max_input_length:
        chunks = textwrap.wrap(text, width=max_input_length)
        summaries = [get_summary(chunk) for chunk in chunks]
        return " ".join(summaries)
    return get_summary(text)
def load_embeddings_and_search(query, embedding_model, k=5):
    """Loads embeddings from FAISS and performs a search."""
    index = faiss.read_index(FAISS_INDEX_PATH)
    query_embedding = embedding_model.embed_query(query)
    distances, indices = index.search(np.array([query_embedding], dtype=np.float32), k)
    results = []
    with open(METADATA_PATH, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip the header row written by store_embeddings
        all_data = list(reader)
    # Pair each hit with its distance by rank; FAISS pads missing hits with -1,
    # so guard the lower bound as well.
    for rank, i in enumerate(indices[0]):
        if 0 <= i < len(all_data):
            results.append((all_data[i][0], all_data[i][1], distances[0][rank]))
    return results
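

# store_embeddings is called from main() but was missing from this file. The
# sketch below is a minimal assumed implementation: it embeds the chunks,
# writes them to a flat-L2 FAISS index at FAISS_INDEX_PATH, and saves
# (source, text) rows plus a header to METADATA_PATH so that
# load_embeddings_and_search can map hits back to their source documents.
def store_embeddings(chunks, embedding_model, doc_sources):
    """Embeds text chunks, stores them in FAISS, and writes metadata to CSV."""
    embeddings = np.array(embedding_model.embed_documents(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(METADATA_PATH, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["source", "text"])  # header skipped by the reader above
        for source, chunk in zip(doc_sources, chunks):
            writer.writerow([source, chunk])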
def generate_gemma_comparison(query, results, max_context_length=1500):
    """Generates a comparison response using gemma2-9b-it based on retrieved data, with length constraints."""
    pdf1_results = [result[1] for result in results if result[0] == 'pdf1']
    pdf2_results = [result[1] for result in results if result[0] == 'pdf2']

    def truncate_or_summarize(text_list):
        # Summarize any chunk that exceeds the context budget; pass short ones through.
        processed_texts = []
        for text in text_list:
            if len(text) > max_context_length:
                processed_texts.append(summarize_text(text[:max_context_length]))
            else:
                processed_texts.append(text)
        return processed_texts

    pdf1_results = truncate_or_summarize(pdf1_results)
    pdf2_results = truncate_or_summarize(pdf2_results)
    pdf1_context = "\n".join(pdf1_results) if pdf1_results else "No relevant information found in ICICI Lombard Health Insurance."
    pdf2_context = "\n".join(pdf2_results) if pdf2_results else "No relevant information found in HDFC Life Insurance."

    prompt = f"""[INST] You are an expert insurance comparison assistant. Given the following summaries from two health insurance policy documents, analyze and compare their coverage, exclusions, pricing, and additional benefits.

ICICI Lombard Health Insurance Information:
{pdf1_context}

HDFC Life Insurance Information:
{pdf2_context}

User Question: {query}

**Step-by-Step Analysis (Chain of Thought Technique):**
1. **Coverage Analysis**: Identify the types of health coverage provided by each policy. Highlight key similarities and differences.
2. **Exclusions**: List exclusions mentioned in each policy and compare their impact on policyholders.
3. **Pricing & Deductibles**: If available, compare premium costs, deductibles, and additional fees.
4. **Flexibility & Additional Benefits**: Check for added benefits like coverage for remote workers, international travel coverage, telemedicine, etc.
5. **Final Comparison**: Summarize key differences and similarities and provide a well-structured conclusion.

Based on this structured approach, generate a detailed comparative response.
Comparison: [/INST]"""

    client = Groq(api_key=GROQ_API_KEY)
    chat_completion = client.chat.completions.create(messages=[{"role": "user", "content": prompt}], model="gemma2-9b-it")
    return chat_completion.choices[0].message.content.strip()
def main():
    st.title("Insurance Policy Comparison Chatbot")
    pdf_files = st.file_uploader("Upload two PDF files", type="pdf", accept_multiple_files=True)

    if pdf_files and len(pdf_files) == 2:
        if not os.path.exists(FAISS_INDEX_PATH) or not os.path.exists(METADATA_PATH):
            if st.button("Process PDFs and Start Chat"):
                with st.spinner("Processing PDFs and creating embeddings..."):
                    pdf_texts = process_pdfs(pdf_files)
                    text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
                    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
                    doc_sources = []
                    chunks = []
                    # Tag each chunk with its source PDF ("pdf1" or "pdf2") for retrieval.
                    for idx, text in enumerate(pdf_texts):
                        if text:
                            pdf_chunks = text_splitter.split_text(text)
                            chunks.extend(pdf_chunks)
                            doc_sources.extend([f"pdf{idx + 1}"] * len(pdf_chunks))
                    store_embeddings(chunks, embedding_model, doc_sources)
                st.session_state.pdf_names = [pdf_files[0].name, pdf_files[1].name]
                st.write("Chatbot is ready. Ask your questions!")
        else:
            # An index already exists on disk; reuse it instead of re-processing.
            st.session_state.pdf_names = [pdf_files[0].name, pdf_files[1].name]
            st.write("Chatbot is ready. Ask your questions!")

        if 'pdf_names' in st.session_state:
            embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
            query = st.chat_input("Ask a question about the insurance policies:")
            if query:
                retrieved_docs = load_embeddings_and_search(query, embedding_model)
                response = generate_gemma_comparison(query, retrieved_docs)
                st.write(f"**Question:** {query}")
                st.write(f"**Answer:** {response}")
                st.write(f"**Policy 1: {st.session_state.pdf_names[0]} (ICICI Lombard)**")
                st.write(f"**Policy 2: {st.session_state.pdf_names[1]} (HDFC Life Insurance)**")
    elif pdf_files:
        st.warning("Please upload exactly two PDF files.")
    else:
        st.info("Please upload two PDF files to start.")


if __name__ == "__main__":
    main()