anything-question-answering

Runtime error

App Files Files Community

LOUIS SANNA committed on Oct 24, 2023

Commit

35c9187

1 Parent(s): 6d2199d

feat(loggign)

Browse files

Files changed (4) hide show

.vscode/settings.json +3 -0
app.py +202 -358
climateqa/logging.py +70 -0
climateqa/vectorstore.py +0 -18

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,3 @@

+{
+    "python.pythonPath": "/Users/louissanna/opt/anaconda3/envs/anything-question-answering/bin/python"
+}

app.py CHANGED Viewed

@@ -1,21 +1,16 @@
 import gradio as gr
-import pandas as pd
-import numpy as np
-import os
-from datetime import datetime
 from utils import create_user_id
-from azure.storage.fileshare import ShareServiceClient
 # Langchain
 from langchain.embeddings import HuggingFaceEmbeddings
-from langchain.schema import AIMessage, HumanMessage
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 # ClimateQ&A imports
 from climateqa.llm import get_llm
-from climateqa.chains import load_qa_chain_with_docs,load_qa_chain_with_text
 from climateqa.chains import load_reformulation_chain
 from climateqa.vectorstore import get_pinecone_vectorstore
 from climateqa.retriever import ClimateQARetriever
@@ -24,6 +19,7 @@ from climateqa.prompts import audience_prompts
 # Load environment variables in local mode
 try:
     from dotenv import load_dotenv
     load_dotenv()
 except Exception as e:
     pass
@@ -36,7 +32,6 @@ theme = gr.themes.Base(
 )
 init_prompt = ""
 system_template = {
@@ -44,47 +39,40 @@ system_template = {
     "content": init_prompt,
 }
-account_key = os.environ["BLOB_ACCOUNT_KEY"]
-if len(account_key) == 86:
-    account_key += "=="
-credential = {
-    "account_key": account_key,
-    "account_name": os.environ["BLOB_ACCOUNT_NAME"],
-}
-account_url = os.environ["BLOB_ACCOUNT_URL"]
-file_share_name = "climategpt"
-service = ShareServiceClient(account_url=account_url, credential=credential)
-share_client = service.get_share_client(file_share_name)
 user_id = create_user_id()
-#---------------------------------------------------------------------------
 # ClimateQ&A core functions
-#---------------------------------------------------------------------------
 from langchain.callbacks.base import BaseCallbackHandler
 from queue import Queue, Empty
 from threading import Thread
 from collections.abc import Generator
 from langchain.schema import LLMResult
-from typing import Any, Union,Dict,List
 from queue import SimpleQueue
 # # Create a Queue
 # Q = Queue()
 import re
 def parse_output_llm_with_sources(output):
     # Split the content into a list of text and "[Doc X]" references
-    content_parts = re.split(r'\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]', output)
     parts = []
     for part in content_parts:
         if part.startswith("Doc"):
             subparts = part.split(",")
-            subparts = [subpart.lower().replace("doc","").strip() for subpart in subparts]
-            subparts = [f"<span class='doc-ref'><sup>{subpart}</sup></span>" for subpart in subparts]
             parts.append("".join(subparts))
         else:
             parts.append(part)
@@ -92,8 +80,7 @@ def parse_output_llm_with_sources(output):
     return content_parts
-job_done = object() # signals the processing is done
 class StreamingGradioCallbackHandler(BaseCallbackHandler):
@@ -125,45 +112,49 @@ class StreamingGradioCallbackHandler(BaseCallbackHandler):
         self.q.put(job_done)
 # Create embeddings function and LLM
-embeddings_function = HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
 # Create vectorstore and retriever
 vectorstore = get_pinecone_vectorstore(embeddings_function)
-#---------------------------------------------------------------------------
 # ClimateQ&A Streaming
 # From https://github.com/gradio-app/gradio/issues/5345
 # And https://stackoverflow.com/questions/76057076/how-to-stream-agents-response-in-langchain
-#---------------------------------------------------------------------------
 from threading import Thread
-import json
-def answer_user(query,query_example,history):
     if len(query) <= 2:
         raise Exception("Please ask a longer question")
     return query, history + [[query, ". . ."]]
-def answer_user_example(query,query_example,history):
     return query_example, history + [[query_example, ". . ."]]
-def fetch_sources(query,sources):
     # Prepare default values
     if len(sources) == 0:
         sources = ["IPCC"]
-    llm_reformulation = get_llm(max_tokens = 512,temperature = 0.0,verbose = True,streaming = False)
-    retriever = ClimateQARetriever(vectorstore=vectorstore,sources = sources,k_summary = 3,k_total = 10)
     reformulation_chain = load_reformulation_chain(llm_reformulation)
     # Calculate language
-    output_reformulation = reformulation_chain({"query":query})
     question = output_reformulation["question"]
     language = output_reformulation["language"]
@@ -171,23 +162,23 @@ def fetch_sources(query,sources):
     docs = retriever.get_relevant_documents(question)
     if len(docs) > 0:
         # Already display the sources
         sources_text = []
         for i, d in enumerate(docs, 1):
             sources_text.append(make_html_source(d, i))
         citations_text = "".join(sources_text)
         docs_text = "\n\n".join([d.page_content for d in docs])
-        return "",citations_text,docs_text,question,language
     else:
-        sources_text = "⚠️ No relevant passages found in the scientific reports (IPCC and IPBES)"
         citations_text = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate and biodiversity issues).**"
         docs_text = ""
-        return "",citations_text,docs_text,question,language
-def answer_bot(query,history,docs,question,language,audience):
     if audience == "Children":
         audience_prompt = audience_prompts["children"]
     elif audience == "General public":
@@ -200,36 +191,52 @@ def answer_bot(query,history,docs,question,language,audience):
     # Prepare Queue for streaming LLMs
     Q = SimpleQueue()
-    llm_streaming = get_llm(max_tokens = 1024,temperature = 0.0,verbose = True,streaming = True,
-        callbacks=[StreamingGradioCallbackHandler(Q),StreamingStdOutCallbackHandler()],
     )
     qa_chain = load_qa_chain_with_text(llm_streaming)
-    def threaded_chain(question,audience,language,docs):
         try:
-            response = qa_chain({"question":question,"audience":audience,"language":language,"summaries":docs})
             Q.put(response)
             Q.put(job_done)
         except Exception as e:
             print(e)
     history[-1][1] = ""
-    textbox=gr.Textbox(placeholder=". . .",show_label=False,scale=1,lines = 1,interactive = False)
     if len(docs) > 0:
         # Start thread for streaming
         thread = Thread(
-            target=threaded_chain,
-            kwargs={"question":question,"audience":audience_prompt,"language":language,"docs":docs}
         )
         thread.start()
         while True:
-            next_item = Q.get(block=True) # Blocks until an input is available
             if next_item is job_done:
                 break
@@ -237,88 +244,27 @@ def answer_bot(query,history,docs,question,language,audience):
                 new_paragraph = history[-1][1] + next_item
                 new_paragraph = parse_output_llm_with_sources(new_paragraph)
                 history[-1][1] = new_paragraph
-                yield textbox,history
             else:
                 pass
         thread.join()
-        # Log answer on Azure Blob Storage
-        timestamp = str(datetime.now().timestamp())
-        file = timestamp + ".json"
-        prompt = history[-1][0]
-        logs = {
-            "user_id": str(user_id),
-            "prompt": prompt,
-            "query": prompt,
-            "question":question,
-            "docs":docs,
-            "answer": history[-1][1],
-            "time": timestamp,
-        }
-        log_on_azure(file, logs, share_client)
     else:
         complete_response = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate and biodiversity issues).**"
         history[-1][1] += complete_response
-        yield "",history
-    # history_langchain_format = []
-    # for human, ai in history:
-    #     history_langchain_format.append(HumanMessage(content=human))
-    #     history_langchain_format.append(AIMessage(content=ai))
-    # history_langchain_format.append(HumanMessage(content=message)
-    # for next_token, content in stream(message):
-    #     yield(content)
-    # thread = Thread(target=threaded_chain, kwargs={"query":message,"audience":audience_prompt})
-    # thread.start()
-    # history[-1][1] = ""
-    # while True:
-    #     next_item = Q.get(block=True) # Blocks until an input is available
-    #     print(type(next_item))
-    #     if next_item is job_done:
-    #         continue
-    #     elif isinstance(next_item, dict):  # assuming LLMResult is a dictionary
-    #         response = next_item
-    #         if "source_documents" in response and len(response["source_documents"]) > 0:
-    #             sources_text = []
-    #             for i, d in enumerate(response["source_documents"], 1):
-    #                 sources_text.append(make_html_source(d, i))
-    #             sources_text = "\n\n".join([f"Query used for retrieval:\n{response['question']}"] + sources_text)
-    #             # history[-1][1] += next_item["answer"]
-    #             # history[-1][1] += "\n\n" + sources_text
-    #             yield "", history, sources_text
-    #         else:
-    #             sources_text = "⚠️ No relevant passages found in the scientific reports (IPCC and IPBES)"
-    #             complete_response = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate and biodiversity issues).**"
-    #             history[-1][1] += "\n\n" + complete_response
-    #             yield "", history, sources_text
-    #         break
-    #     elif isinstance(next_item, str):
-    #         new_paragraph = history[-1][1] + next_item
-    #         new_paragraph = parse_output_llm_with_sources(new_paragraph)
-    #         history[-1][1] = new_paragraph
-    #         yield "", history, ""
-    # thread.join()
-#---------------------------------------------------------------------------
 # ClimateQ&A core functions
-#---------------------------------------------------------------------------
-def make_html_source(source,i):
     meta = source.metadata
-    content = source.page_content.split(":",1)[1].strip()
     return f"""
 <div class="card">
     <div class="card-content">
@@ -335,144 +281,9 @@ def make_html_source(source,i):
 """
-# def chat(
-#     user_id: str,
-#     query: str,
-#     history: list = [system_template],
-#     report_type: str = "IPCC",
-#     threshold: float = 0.555,
-# ) -> tuple:
-#     """retrieve relevant documents in the document store then query gpt-turbo
-#     Args:
-#         query (str): user message.
-#         history (list, optional): history of the conversation. Defaults to [system_template].
-#         report_type (str, optional): should be "All available" or "IPCC only". Defaults to "All available".
-#         threshold (float, optional): similarity threshold, don't increase more than 0.568. Defaults to 0.56.
-#     Yields:
-#         tuple: chat gradio format, chat openai format, sources used.
-#     """
-#     if report_type not in ["IPCC","IPBES"]: report_type = "all"
-#     print("Searching in ",report_type," reports")
-#     # if report_type == "All available":
-#     #     retriever = retrieve_all
-#     # elif report_type == "IPCC only":
-#     #     retriever = retrieve_giec
-#     # else:
-#     #     raise Exception("report_type arg should be in (All available, IPCC only)")
-#     reformulated_query = openai.Completion.create(
-#         engine="EkiGPT",
-#         prompt=get_reformulation_prompt(query),
-#         temperature=0,
-#         max_tokens=128,
-#         stop=["\n---\n", "<|im_end|>"],
-#     )
-#     reformulated_query = reformulated_query["choices"][0]["text"]
-#     reformulated_query, language = reformulated_query.split("\n")
-#     language = language.split(":")[1].strip()
-#     sources = retrieve_with_summaries(reformulated_query,retriever,k_total = 10,k_summary = 3,as_dict = True,source = report_type.lower(),threshold = threshold)
-#     response_retriever = {
-#       "language":language,
-#       "reformulated_query":reformulated_query,
-#       "query":query,
-#       "sources":sources,
-#     }
-#     # docs = [d for d in retriever.retrieve(query=reformulated_query, top_k=10) if d.score > threshold]
-#     messages = history + [{"role": "user", "content": query}]
-#     if len(sources) > 0:
-#         docs_string = []
-#         docs_html = []
-#         for i, d in enumerate(sources, 1):
-#             docs_string.append(f"📃 Doc {i}: {d['meta']['short_name']} page {d['meta']['page_number']}\n{d['content']}")
-#             docs_html.append(make_html_source(d,i))
-#         docs_string = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_string)
-#         docs_html = "\n\n".join([f"Query used for retrieval:\n{reformulated_query}"] + docs_html)
-#         messages.append({"role": "system", "content": f"{sources_prompt}\n\n{docs_string}\n\nAnswer in {language}:"})
-#         response = openai.Completion.create(
-#             engine="EkiGPT",
-#             prompt=to_completion(messages),
-#             temperature=0,  # deterministic
-#             stream=True,
-#             max_tokens=1024,
-#         )
-#         complete_response = ""
-#         messages.pop()
-#         messages.append({"role": "assistant", "content": complete_response})
-#         timestamp = str(datetime.now().timestamp())
-#         file = user_id + timestamp + ".json"
-#         logs = {
-#             "user_id": user_id,
-#             "prompt": query,
-#             "retrived": sources,
-#             "report_type": report_type,
-#             "prompt_eng": messages[0],
-#             "answer": messages[-1]["content"],
-#             "time": timestamp,
-#         }
-#         log_on_azure(file, logs, share_client)
-#         for chunk in response:
-#             if (chunk_message := chunk["choices"][0].get("text")) and chunk_message != "<|im_end|>":
-#                 complete_response += chunk_message
-#                 messages[-1]["content"] = complete_response
-#                 gradio_format = make_pairs([a["content"] for a in messages[1:]])
-#                 yield gradio_format, messages, docs_html
-#     else:
-#         docs_string = "⚠️ No relevant passages found in the climate science reports (IPCC and IPBES)"
-#         complete_response = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate issues).**"
-#         messages.append({"role": "assistant", "content": complete_response})
-#         gradio_format = make_pairs([a["content"] for a in messages[1:]])
-#         yield gradio_format, messages, docs_string
-def save_feedback(feed: str, user_id):
-    if len(feed) > 1:
-        timestamp = str(datetime.now().timestamp())
-        file = user_id + timestamp + ".json"
-        logs = {
-            "user_id": user_id,
-            "feedback": feed,
-            "time": timestamp,
-        }
-        log_on_azure(file, logs, share_client)
-        return "Feedback submitted, thank you!"
 def reset_textbox():
     return gr.update(value="")
-import json
-def log_on_azure(file, logs, share_client):
-    logs = json.dumps(logs)
-    print(type(logs))
-    file_client = share_client.get_file_client(file)
-    print("Uploading logs to Azure Blob Storage")
-    print("----------------------------------")
-    print("")
-    print(logs)
-    file_client.upload_file(logs)
-    print("Logs uploaded to Azure Blob Storage")
-# def disable_component():
-#     return gr.update(interactive = False)
 # --------------------------------------------------------------------
 # Gradio
@@ -509,29 +320,33 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
     # user_id_state = gr.State([user_id])
     with gr.Tab("🌍 ClimateQ&A"):
         with gr.Row(elem_id="chatbot-row"):
             with gr.Column(scale=2):
                 # state = gr.State([system_template])
                 bot = gr.Chatbot(
-                    value=[[None,init_prompt]],
-                    show_copy_button=True,show_label = False,elem_id="chatbot",layout = "panel",avatar_images = ("assets/logo4.png",None))
-                # bot.like(vote,None,None)
-                with gr.Row(elem_id = "input-message"):
-                    textbox=gr.Textbox(placeholder="Ask me anything here!",show_label=False,scale=1,lines = 1,interactive = True)
                     # submit_button = gr.Button(">",scale = 1,elem_id = "submit-button")
-            with gr.Column(scale=1, variant="panel",elem_id = "right-panel"):
                 with gr.Tabs() as tabs:
-                    with gr.TabItem("📝 Examples",elem_id = "tab-examples",id = 0):
                         examples_hidden = gr.Textbox(elem_id="hidden-message")
                         examples_questions = gr.Examples(
@@ -575,14 +390,16 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
                             # cache_examples=True,
                         )
-                    with gr.Tab("📚 Citations",elem_id = "tab-citations",id = 1):
-                        sources_textbox = gr.HTML(show_label=False, elem_id="sources-textbox")
                         docs_textbox = gr.State("")
-                    with gr.Tab("⚙️ Configuration",elem_id = "tab-config",id = 2):
-                        gr.Markdown("Reminder: You can talk in any language, ClimateQ&A is multi-lingual!")
                         dropdown_sources = gr.CheckboxGroup(
                             ["IPCC", "IPBES"],
@@ -592,56 +409,106 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
                         )
                         dropdown_audience = gr.Dropdown(
-                            ["Children","General public","Experts"],
                             label="Select audience",
                             value="Experts",
                             interactive=True,
                         )
-                        output_query = gr.Textbox(label="Query used for retrieval",show_label = True,elem_id = "reformulated-query",lines = 2,interactive = False)
-                        output_language = gr.Textbox(label="Language",show_label = True,elem_id = "language",lines = 1,interactive = False)
                 # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
-                (textbox
-                    .submit(answer_user, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-                    .success(change_tab,None,tabs)
-                    .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-                    .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue = True)
-                    .success(lambda x : textbox,[textbox],[textbox])
                 )
-                (examples_hidden
-                    .change(answer_user_example, [textbox,examples_hidden, bot], [textbox, bot],queue = False)
-                    .success(change_tab,None,tabs)
-                    .success(fetch_sources,[textbox,dropdown_sources], [textbox,sources_textbox,docs_textbox,output_query,output_language])
-                    .success(answer_bot, [textbox,bot,docs_textbox,output_query,output_language,dropdown_audience], [textbox,bot],queue=True)
-                    .success(lambda x : textbox,[textbox],[textbox])
                 )
                 # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
                 #         answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
                 #     )
-#---------------------------------------------------------------------------------------
-# OTHER TABS
-#---------------------------------------------------------------------------------------
-    with gr.Tab("ℹ️ About ClimateQ&A",elem_classes = "max-height"):
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown(
@@ -660,7 +527,9 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
             with gr.Column(scale=1):
                 gr.Markdown("![](https://i.postimg.cc/fLvsvMzM/Untitled-design-5.png)")
-                gr.Markdown("*Source : IPCC AR6 - Synthesis Report of the IPCC 6th assessment report (AR6)*")
         gr.Markdown("## How to use ClimateQ&A")
         with gr.Row():
@@ -688,7 +557,6 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
         """
                 )
     with gr.Tab("📧 Contact, feedback and feature requests"):
         gr.Markdown(
             """
@@ -702,37 +570,10 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
         *This tool has been developed by the R&D lab at **Ekimetrics** (Jean Lelong, Nina Achache, Gabriel Olympie, Nicolas Chesneau, Natalia De la Calzada, Théo Alves Da Costa)*
         """
         )
-    # with gr.Row():
-    #     with gr.Column(scale=1):
-    #         gr.Markdown("### Feedbacks")
-    #         feedback = gr.Textbox(label="Write your feedback here")
-    #         feedback_output = gr.Textbox(label="Submit status")
-    #         feedback_save = gr.Button(value="submit feedback")
-    #         feedback_save.click(
-    #             save_feedback,
-    #             inputs=[feedback, user_id_state],
-    #             outputs=feedback_output,
-    #         )
-    #         gr.Markdown(
-    #             "If you need us to ask another climate science report or ask any question, contact us at <b>[email protected]</b>"
-    #         )
-    #     with gr.Column(scale=1):
-    #         gr.Markdown("### OpenAI API")
-    #         gr.Markdown(
-    #             "To make climate science accessible to a wider audience, we have opened our own OpenAI API key with a monthly cap of $1000. If you already have an API key, please use it to help conserve bandwidth for others."
-    #         )
-    #         openai_api_key_textbox = gr.Textbox(
-    #             placeholder="Paste your OpenAI API key (sk-...) and hit Enter",
-    #             show_label=False,
-    #             lines=1,
-    #             type="password",
-    #         )
-    # openai_api_key_textbox.change(set_openai_api_key, inputs=[openai_api_key_textbox])
-    # openai_api_key_textbox.submit(set_openai_api_key, inputs=[openai_api_key_textbox])
-    with gr.Tab("📚 Sources",elem_classes = "max-height"):
-        gr.Markdown("""
     | Source | Report | URL | Number of pages | Release date |
     | --- | --- | --- | --- | --- |
     IPCC | Summary for Policymakers. In: Climate Change 2021: The Physical Science Basis. Contribution of the WGI to the AR6 of the IPCC. | https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_SPM.pdf | 32 | 2021
@@ -770,10 +611,12 @@ with gr.Blocks(title="🌍 Climate Q&A", css="style.css", theme=theme) as demo:
     IPBES | Summary for Policymakers. Regional Assessment Report on Biodiversity and Ecosystem Services for Europe and Central Asia. | https://zenodo.org/record/3237468/files/ipbes_assessment_spm_eca_EN.pdf | 52 | 2018
     IPBES | Full Report. Assessment Report on Land Degradation and Restoration. | https://zenodo.org/record/3237393/files/ipbes_assessment_report_ldra_EN.pdf | 748 | 2018
     IPBES | Summary for Policymakers. Assessment Report on Land Degradation and Restoration. | https://zenodo.org/record/3237393/files/ipbes_assessment_report_ldra_EN.pdf | 48 | 2018
-""")
     with gr.Tab("🛢️ Carbon Footprint"):
-        gr.Markdown("""
 Carbon emissions were measured during the development and inference process using CodeCarbon [https://github.com/mlco2/codecarbon](https://github.com/mlco2/codecarbon)
@@ -787,10 +630,11 @@ Carbon emissions were measured during the development and inference process usin
 Carbon Emissions are **relatively low but not negligible** compared to other usages: one question asked to ClimateQ&A is around 0.482gCO2e - equivalent to 2.2m by car (https://datagir.ademe.fr/apps/impact-co2/)
 Or around 2 to 4 times more than a typical Google search.
 """
-    )
     with gr.Tab("🪄 Changelog"):
-        gr.Markdown("""
 ##### v1.1.0 - *2023-10-16*
 - ClimateQ&A on Hugging Face is finally working again with all the new features !
@@ -805,7 +649,7 @@ Or around 2 to 4 times more than a typical Google search.
 - Add children mode on https://climateqa.com
 - Add follow-up questions https://climateqa.com
 """
-    )
     demo.queue(concurrency_count=16)

 import gradio as gr
 from utils import create_user_id
 # Langchain
 from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 # ClimateQ&A imports
 from climateqa.llm import get_llm
+from climateqa.logging import log
+from climateqa.chains import load_qa_chain_with_text
 from climateqa.chains import load_reformulation_chain
 from climateqa.vectorstore import get_pinecone_vectorstore
 from climateqa.retriever import ClimateQARetriever
 # Load environment variables in local mode
 try:
     from dotenv import load_dotenv
     load_dotenv()
 except Exception as e:
     pass
 )
 init_prompt = ""
 system_template = {
     "content": init_prompt,
 }
 user_id = create_user_id()
+# ---------------------------------------------------------------------------
 # ClimateQ&A core functions
+# ---------------------------------------------------------------------------
 from langchain.callbacks.base import BaseCallbackHandler
 from queue import Queue, Empty
 from threading import Thread
 from collections.abc import Generator
 from langchain.schema import LLMResult
+from typing import Any, Union, Dict, List
 from queue import SimpleQueue
 # # Create a Queue
 # Q = Queue()
 import re
 def parse_output_llm_with_sources(output):
     # Split the content into a list of text and "[Doc X]" references
+    content_parts = re.split(r"\[(Doc\s?\d+(?:,\s?Doc\s?\d+)*)\]", output)
     parts = []
     for part in content_parts:
         if part.startswith("Doc"):
             subparts = part.split(",")
+            subparts = [
+                subpart.lower().replace("doc", "").strip() for subpart in subparts
+            ]
+            subparts = [
+                f"<span class='doc-ref'><sup>{subpart}</sup></span>"
+                for subpart in subparts
+            ]
             parts.append("".join(subparts))
         else:
             parts.append(part)
     return content_parts
+job_done = object()  # signals the processing is done
 class StreamingGradioCallbackHandler(BaseCallbackHandler):
         self.q.put(job_done)
 # Create embeddings function and LLM
+embeddings_function = HuggingFaceEmbeddings(
+    model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1"
+)
 # Create vectorstore and retriever
 vectorstore = get_pinecone_vectorstore(embeddings_function)
+# ---------------------------------------------------------------------------
 # ClimateQ&A Streaming
 # From https://github.com/gradio-app/gradio/issues/5345
 # And https://stackoverflow.com/questions/76057076/how-to-stream-agents-response-in-langchain
+# ---------------------------------------------------------------------------
 from threading import Thread
+def answer_user(query, query_example, history):
     if len(query) <= 2:
         raise Exception("Please ask a longer question")
     return query, history + [[query, ". . ."]]
+def answer_user_example(query, query_example, history):
     return query_example, history + [[query_example, ". . ."]]
+def fetch_sources(query, sources):
     # Prepare default values
     if len(sources) == 0:
         sources = ["IPCC"]
+    llm_reformulation = get_llm(
+        max_tokens=512, temperature=0.0, verbose=True, streaming=False
+    )
+    retriever = ClimateQARetriever(
+        vectorstore=vectorstore, sources=sources, k_summary=3, k_total=10
+    )
     reformulation_chain = load_reformulation_chain(llm_reformulation)
     # Calculate language
+    output_reformulation = reformulation_chain({"query": query})
     question = output_reformulation["question"]
     language = output_reformulation["language"]
     docs = retriever.get_relevant_documents(question)
     if len(docs) > 0:
         # Already display the sources
         sources_text = []
         for i, d in enumerate(docs, 1):
             sources_text.append(make_html_source(d, i))
         citations_text = "".join(sources_text)
         docs_text = "\n\n".join([d.page_content for d in docs])
+        return "", citations_text, docs_text, question, language
     else:
+        sources_text = (
+            "⚠️ No relevant passages found in the scientific reports (IPCC and IPBES)"
+        )
         citations_text = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate and biodiversity issues).**"
         docs_text = ""
+        return "", citations_text, docs_text, question, language
+def answer_bot(query, history, docs, question, language, audience):
     if audience == "Children":
         audience_prompt = audience_prompts["children"]
     elif audience == "General public":
     # Prepare Queue for streaming LLMs
     Q = SimpleQueue()
+    llm_streaming = get_llm(
+        max_tokens=1024,
+        temperature=0.0,
+        verbose=True,
+        streaming=True,
+        callbacks=[StreamingGradioCallbackHandler(Q), StreamingStdOutCallbackHandler()],
     )
     qa_chain = load_qa_chain_with_text(llm_streaming)
+    def threaded_chain(question, audience, language, docs):
         try:
+            response = qa_chain(
+                {
+                    "question": question,
+                    "audience": audience,
+                    "language": language,
+                    "summaries": docs,
+                }
+            )
             Q.put(response)
             Q.put(job_done)
         except Exception as e:
             print(e)
     history[-1][1] = ""
+    textbox = gr.Textbox(
+        placeholder=". . .", show_label=False, scale=1, lines=1, interactive=False
+    )
     if len(docs) > 0:
         # Start thread for streaming
         thread = Thread(
+            target=threaded_chain,
+            kwargs={
+                "question": question,
+                "audience": audience_prompt,
+                "language": language,
+                "docs": docs,
+            },
         )
         thread.start()
         while True:
+            next_item = Q.get(block=True)  # Blocks until an input is available
             if next_item is job_done:
                 break
                 new_paragraph = history[-1][1] + next_item
                 new_paragraph = parse_output_llm_with_sources(new_paragraph)
                 history[-1][1] = new_paragraph
+                yield textbox, history
             else:
                 pass
         thread.join()
+        log(question=question, history=history, docs=docs, user_id=user_id)
     else:
         complete_response = "**⚠️ No relevant passages found in the climate science reports (IPCC and IPBES), you may want to ask a more specific question (specifying your question on climate and biodiversity issues).**"
         history[-1][1] += complete_response
+        yield "", history
+# ---------------------------------------------------------------------------
 # ClimateQ&A core functions
+# ---------------------------------------------------------------------------
+def make_html_source(source, i):
     meta = source.metadata
+    content = source.page_content.split(":", 1)[1].strip()
     return f"""
 <div class="card">
     <div class="card-content">
 """
 def reset_textbox():
     """Return a Gradio update that sets the target component's value to an empty string."""
     return gr.update(value="")
 # --------------------------------------------------------------------
 # Gradio
     # user_id_state = gr.State([user_id])
     with gr.Tab("🌍 ClimateQ&A"):
         with gr.Row(elem_id="chatbot-row"):
             with gr.Column(scale=2):
                 # state = gr.State([system_template])
                 bot = gr.Chatbot(
+                    value=[[None, init_prompt]],
+                    show_copy_button=True,
+                    show_label=False,
+                    elem_id="chatbot",
+                    layout="panel",
+                    avatar_images=("assets/logo4.png", None),
+                )
+                # bot.like(vote,None,None)
+                with gr.Row(elem_id="input-message"):
+                    textbox = gr.Textbox(
+                        placeholder="Ask me anything here!",
+                        show_label=False,
+                        scale=1,
+                        lines=1,
+                        interactive=True,
+                    )
                     # submit_button = gr.Button(">",scale = 1,elem_id = "submit-button")
+            with gr.Column(scale=1, variant="panel", elem_id="right-panel"):
                 with gr.Tabs() as tabs:
+                    with gr.TabItem("📝 Examples", elem_id="tab-examples", id=0):
                         examples_hidden = gr.Textbox(elem_id="hidden-message")
                         examples_questions = gr.Examples(
                             # cache_examples=True,
                         )
+                    with gr.Tab("📚 Citations", elem_id="tab-citations", id=1):
+                        sources_textbox = gr.HTML(
+                            show_label=False, elem_id="sources-textbox"
+                        )
                         docs_textbox = gr.State("")
+                    with gr.Tab("⚙️ Configuration", elem_id="tab-config", id=2):
+                        gr.Markdown(
+                            "Reminder: You can talk in any language, ClimateQ&A is multi-lingual!"
+                        )
                         dropdown_sources = gr.CheckboxGroup(
                             ["IPCC", "IPBES"],
                         )
                         dropdown_audience = gr.Dropdown(
+                            ["Children", "General public", "Experts"],
                             label="Select audience",
                             value="Experts",
                             interactive=True,
                         )
+                        output_query = gr.Textbox(
+                            label="Query used for retrieval",
+                            show_label=True,
+                            elem_id="reformulated-query",
+                            lines=2,
+                            interactive=False,
+                        )
+                        output_language = gr.Textbox(
+                            label="Language",
+                            show_label=True,
+                            elem_id="language",
+                            lines=1,
+                            interactive=False,
+                        )
                 # textbox.submit(predict_climateqa,[textbox,bot],[None,bot,sources_textbox])
+                (
+                    textbox.submit(
+                        answer_user,
+                        [textbox, examples_hidden, bot],
+                        [textbox, bot],
+                        queue=False,
+                    )
+                    .success(change_tab, None, tabs)
+                    .success(
+                        fetch_sources,
+                        [textbox, dropdown_sources],
+                        [
+                            textbox,
+                            sources_textbox,
+                            docs_textbox,
+                            output_query,
+                            output_language,
+                        ],
+                    )
+                    .success(
+                        answer_bot,
+                        [
+                            textbox,
+                            bot,
+                            docs_textbox,
+                            output_query,
+                            output_language,
+                            dropdown_audience,
+                        ],
+                        [textbox, bot],
+                        queue=True,
+                    )
+                    .success(lambda x: textbox, [textbox], [textbox])
                 )
+                (
+                    examples_hidden.change(
+                        answer_user_example,
+                        [textbox, examples_hidden, bot],
+                        [textbox, bot],
+                        queue=False,
+                    )
+                    .success(change_tab, None, tabs)
+                    .success(
+                        fetch_sources,
+                        [textbox, dropdown_sources],
+                        [
+                            textbox,
+                            sources_textbox,
+                            docs_textbox,
+                            output_query,
+                            output_language,
+                        ],
+                    )
+                    .success(
+                        answer_bot,
+                        [
+                            textbox,
+                            bot,
+                            docs_textbox,
+                            output_query,
+                            output_language,
+                            dropdown_audience,
+                        ],
+                        [textbox, bot],
+                        queue=True,
+                    )
+                    .success(lambda x: textbox, [textbox], [textbox])
                 )
                 # submit_button.click(answer_user, [textbox, bot], [textbox, bot], queue=True).then(
                 #         answer_bot, [textbox,bot,dropdown_audience,dropdown_sources], [textbox,bot,sources_textbox]
                 #     )
+    # ---------------------------------------------------------------------------------------
+    # OTHER TABS
+    # ---------------------------------------------------------------------------------------
+    with gr.Tab("ℹ️ About ClimateQ&A", elem_classes="max-height"):
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown(
             with gr.Column(scale=1):
                 gr.Markdown("![](https://i.postimg.cc/fLvsvMzM/Untitled-design-5.png)")
+                gr.Markdown(
+                    "*Source : IPCC AR6 - Synthesis Report of the IPCC 6th assessment report (AR6)*"
+                )
         gr.Markdown("## How to use ClimateQ&A")
         with gr.Row():
         """
                 )
     with gr.Tab("📧 Contact, feedback and feature requests"):
         gr.Markdown(
             """
         *This tool has been developed by the R&D lab at **Ekimetrics** (Jean Lelong, Nina Achache, Gabriel Olympie, Nicolas Chesneau, Natalia De la Calzada, Théo Alves Da Costa)*
         """
         )
+    with gr.Tab("📚 Sources", elem_classes="max-height"):
+        gr.Markdown(
+            """
     | Source | Report | URL | Number of pages | Release date |
     | --- | --- | --- | --- | --- |
     IPCC | Summary for Policymakers. In: Climate Change 2021: The Physical Science Basis. Contribution of the WGI to the AR6 of the IPCC. | https://www.ipcc.ch/report/ar6/wg1/downloads/report/IPCC_AR6_WGI_SPM.pdf | 32 | 2021
     IPBES | Summary for Policymakers. Regional Assessment Report on Biodiversity and Ecosystem Services for Europe and Central Asia. | https://zenodo.org/record/3237468/files/ipbes_assessment_spm_eca_EN.pdf | 52 | 2018
     IPBES | Full Report. Assessment Report on Land Degradation and Restoration. | https://zenodo.org/record/3237393/files/ipbes_assessment_report_ldra_EN.pdf | 748 | 2018
     IPBES | Summary for Policymakers. Assessment Report on Land Degradation and Restoration. | https://zenodo.org/record/3237393/files/ipbes_assessment_report_ldra_EN.pdf | 48 | 2018
+"""
+        )
     with gr.Tab("🛢️ Carbon Footprint"):
+        gr.Markdown(
+            """
 Carbon emissions were measured during the development and inference process using CodeCarbon [https://github.com/mlco2/codecarbon](https://github.com/mlco2/codecarbon)
 Carbon Emissions are **relatively low but not negligible** compared to other usages: one question asked to ClimateQ&A is around 0.482gCO2e - equivalent to 2.2m by car (https://datagir.ademe.fr/apps/impact-co2/)
 Or around 2 to 4 times more than a typical Google search.
 """
+        )
     with gr.Tab("🪄 Changelog"):
+        gr.Markdown(
+            """
 ##### v1.1.0 - *2023-10-16*
 - ClimateQ&A on Hugging Face is finally working again with all the new features !
 - Add children mode on https://climateqa.com
 - Add follow-up questions https://climateqa.com
 """
+        )
     demo.queue(concurrency_count=16)

climateqa/logging.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import datetime
+import json
+import os
+from azure.storage.fileshare import ShareServiceClient
def log(question, history, docs, user_id):
    """Record one question/answer exchange, but only when Azure is configured.

    The arguments are forwarded unchanged to ``log_in_azure``. When
    ``has_blob_config()`` is false the call is a no-op and nothing is logged.
    """
    if not has_blob_config():
        return
    log_in_azure(question, history, docs, user_id)
def has_blob_config():
    """
    Checks if the necessary environment variables for Azure Blob Storage are set.

    A variable that is defined but empty (for example a blank entry in a
    ``.env`` file) is treated as missing, because an empty key, account name
    or URL cannot be used to build the storage client.

    Returns True if every variable has a non-empty value, False otherwise.
    """
    required_keys = ("BLOB_ACCOUNT_KEY", "BLOB_ACCOUNT_NAME", "BLOB_ACCOUNT_URL")
    # ``get`` returns None for an absent key and "" for a blank one; both are falsy.
    return all(os.environ.get(key) for key in required_keys)
def log_in_azure(question, history, docs, user_id):
    """Build the log record for the latest exchange and upload it.

    Args:
        question: Stored as-is under the ``"question"`` key.
        history: Chat history; the last entry is read as ``[prompt, answer]``.
        docs: Stored as-is under the ``"docs"`` key.
        user_id: Identifier of the user; stored as a string.

    The record is handed to ``upload_azure`` under a file name made of the
    current POSIX timestamp plus ``.json``.
    """
    # The module uses ``import datetime``, so the class has to be reached as
    # ``datetime.datetime``; calling ``datetime.now()`` on the module raised
    # AttributeError on every call.
    timestamp = str(datetime.datetime.now().timestamp())
    file_name = timestamp + ".json"
    prompt = history[-1][0]
    logs = {
        "user_id": str(user_id),
        "prompt": prompt,
        # Kept identical to "prompt" as in the original record layout.
        "query": prompt,
        "question": question,
        "docs": docs,
        "answer": history[-1][1],
        "time": timestamp,
    }
    upload_azure(file_name, logs)
def get_azure_blob_client():
    """Create and return the client for the ``climategpt`` Azure file share.

    Reads ``BLOB_ACCOUNT_KEY``, ``BLOB_ACCOUNT_NAME`` and ``BLOB_ACCOUNT_URL``
    from the environment, in that order; a missing variable raises KeyError.
    """
    env = os.environ
    raw_key = env["BLOB_ACCOUNT_KEY"]
    # An 86-character key gets "==" appended (presumably restoring base64
    # padding that was stripped from the stored secret — TODO confirm).
    padded_key = raw_key + "==" if len(raw_key) == 86 else raw_key
    account_name = env["BLOB_ACCOUNT_NAME"]
    url = env["BLOB_ACCOUNT_URL"]
    service = ShareServiceClient(
        account_url=url,
        credential={"account_key": padded_key, "account_name": account_name},
    )
    return service.get_share_client("climategpt")
# Build the shared file-share client once at import time when the Azure
# settings are present. Otherwise bind the name to None, so code that checks
# ``share_client is not None`` gets a real value instead of a NameError.
share_client = get_azure_blob_client() if has_blob_config() else None
def upload_azure(file, logs):
    """Serialize ``logs`` to JSON and upload it to the Azure file share.

    Args:
        file: Name of the file to create in the share.
        logs: Object to serialize with ``json.dumps``.

    Raises:
        TypeError: If ``logs`` is not JSON-serializable.
        KeyError: If the client has to be created here and one of the Azure
            environment variables is missing.
    """
    global share_client
    payload = json.dumps(logs)
    # The module-level client exists only if the Azure settings were present
    # when this module was imported. If the environment was populated later,
    # the name is unbound or None, so create and cache the client on first use
    # rather than relying on an ``assert`` (which is stripped under ``-O``).
    client = globals().get("share_client")
    if client is None:
        client = share_client = get_azure_blob_client()
    file_client = client.get_file_client(file)
    print("Uploading logs to Azure Blob Storage")
    print("----------------------------------")
    print("")
    print(payload)
    file_client.upload_file(payload)
    print("Logs uploaded to Azure Blob Storage")

climateqa/vectorstore.py CHANGED Viewed

@@ -24,21 +24,3 @@ def get_pinecone_vectorstore(embeddings,text_key = "content"):
     index_name = os.getenv("PINECONE_API_INDEX")
     vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
     return vectorstore
-# def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
-#     assert isinstance(sources,list)
-#     # Check if all elements in the list are either IPCC or IPBES
-#     filter = {
-#         "source": { "$in":sources},
-#     }
-#     retriever = vectorstore.as_retriever(search_kwargs={
-#         "k": k,
-#         "namespace":"vectors",
-#         "filter":filter
-#     })
-#     return retriever

     index_name = os.getenv("PINECONE_API_INDEX")
     vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
     return vectorstore