import re
import json

import streamlit
import spacy_streamlit
import spacy
from lxml import etree
import pandas as pd

# Use the "wide" page layout for the whole app.
streamlit.set_page_config(layout="wide")

# Maps a sample file name to its path under ./samples/.
samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}

# Application settings; the 'ef_endpoint' key is read later when the
# entity-fishing component is added to the pipeline.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (JSON files are UTF-8).
with open('config.json', mode="r", encoding="utf-8") as json_file:
    CONFIGURATION = json.load(json_file)


# TITLE APP
streamlit.title("NER4Archives visualizer")
streamlit.sidebar.title("NER4Archives visualizer")
streamlit.sidebar.write("## Motivation")
# The sidebar text is raw HTML, hence unsafe_allow_html=True. Every <p> must
# be closed explicitly so the paragraphs do not nest.
streamlit.sidebar.markdown("""<div style="text-align: justify;">
<p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>

<p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset 
extracted from XML EAD finding aids and test it on new data.</p>

<p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a> 
framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>. 
Other models may be added in the future.</p>

<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>

NER4Archives - 2022</div>
""", unsafe_allow_html=True)

# Two logos side by side at the bottom of the sidebar.
scol1, scol2 = streamlit.sidebar.columns(2)
scol1.image("./assets/an.png", width=170)
scol2.image("./assets/almanach_rouge-inria.png", width=100)

# Set to True once a non-empty, valid UTF-8 file has been uploaded.
flag_file = False

# 1. User provides a XML EAD
streamlit.write("## 📄 Input XML EAD:")
filename = streamlit.file_uploader("Upload an XML EAD", type="xml")
streamlit.markdown("or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
data = ""
# Set to True further down, once a NER model has been chosen.
flag_model = False

if filename is not None:
    uploaded_bytes = filename.getvalue()
    try:
        # Validation only: the XML parser used later is forced to UTF-8, so
        # anything else is rejected here with a readable message instead of
        # an unhandled UnicodeDecodeError.
        uploaded_bytes.decode("utf-8")
    except UnicodeDecodeError:
        streamlit.error("The uploaded file is not valid UTF-8. Please convert it to UTF-8 and upload it again.")
    else:
        # Valid UTF-8 bytes are kept as they are (a decode/encode round trip
        # would return the very same bytes).
        data = uploaded_bytes
        if data:
            flag_file = True
def ead_strategy(tree):
    """Collect every ``<did>`` found under a ``<dsc>`` and its flattened text.

    For each ``<did>`` the text of the element is gathered, followed by the
    text of its next sibling element when there is one. Runs of two or more
    whitespace characters are collapsed to a single space and the result is
    padded with one leading and one trailing space.

    :param tree: element exposing ``xpath()``; the matched elements must
        expose ``itertext()`` and ``getnext()`` (lxml element API).
    :return: a ``(dids, sentences)`` tuple of two lists of equal length,
        ``sentences[i]`` being the text extracted for ``dids[i]``.
    """
    container_dids = []
    sentences = []
    # get the <dsc> level
    for dsc in tree.xpath('.//dsc'):
        # get <did> levels
        for did in dsc.xpath('.//did'):
            container_dids.append(did)
            parts = [" ".join(chunk.strip() for chunk in did.itertext() if chunk)]
            # NOTE(review): the next sibling is used whatever its tag; it is
            # presumably expected to be the <scopecontent> - confirm.
            sibling = did.getnext()
            if sibling is not None:
                parts.append(
                    " ".join(" ".join(chunk.split()) for chunk in sibling.itertext() if chunk))
            # Join both parts with a space: without it the last word of the
            # <did> and the first word of the sibling are fused when the XML
            # has no whitespace between its tags. Extra spaces are collapsed
            # right below, so already-separated text is unaffected.
            text = " ".join(parts)
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    return container_dids, sentences

model = ""
linking = True
# Set to True once the uploaded document has been parsed and displayed.
flag_view = False
if flag_file:
    # recover=True lets the parser go on after markup errors; the input is
    # always decoded as UTF-8.
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    tree = etree.fromstring(data, parser=parser)
    if tree is None:
        # A recovering parser may end up with no root element at all (e.g.
        # the content is not XML); serialising None below would crash.
        streamlit.error("The uploaded file could not be parsed as XML.")
    else:
        col1, col2 = streamlit.columns(2)
        col1.write("## 👁️ XML tree view:")
        col2.write("## 👁️ Plain text view:")
        # Left column: the pretty-printed XML.
        xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
        col1.text_area("", value=xml, height=500, disabled=True)
        # Right column: one extracted text line per <did>.
        dids, sentences = ead_strategy(tree)
        plain = "\n".join(sentences)
        col2.text_area("", value=plain, height=500, disabled=True)
        flag_view = True

if flag_view:
    streamlit.write("## ⚙️ Configure NER model and options:")
    # Names of the installed spaCy pipelines (iterating the "pipelines"
    # entry of spacy.info() yields the package names).
    models = list(spacy.info()["pipelines"])
    option = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        models)
    model = option
    # Truthiness test rather than `!= ""`: with no pipeline installed the
    # select box has nothing to return (None per the Streamlit API), and that
    # value must not enable the launch step.
    if model:
        flag_model = True
    linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
    linkingicon = "✅️" if linking else "❌"
    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')


# One tuple per recognised entity:
# (start, end, mention, label, QID, Wikidata URL, linking score).
entities = []
docs = []
ents = []
flag_vizualize = False

# Launch NER process:
if flag_model and streamlit.button('Launch'):
    plain = "\n".join(sentences)

    with streamlit.spinner('Initialize NER...'):
        nlp = spacy.load(model)
        nlp.max_length = 5000000
        if linking:
            nlp.add_pipe('entityfishing', config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})

    with streamlit.spinner('NER processing...'):
        # Only the linking run sets an explicit batch size.
        pipe_options = {"batch_size": 250} if linking else {}
        # Position of the current sentence inside `plain`; sentences are
        # joined with a single "\n", hence the +1 after each of them.
        offset = 0
        for doc in nlp.pipe(sentences, **pipe_options):
            for ent in doc.ents:
                begin = offset + ent.start_char
                if linking:
                    kb_fields = (ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score)
                else:
                    kb_fields = ("", "", "")
                entities.append((begin, begin + len(ent.text), ent.text, ent.label_) + kb_fields)
            offset += len(doc.text) + 1

    streamlit.success('😃 NER applied with success!')

    column_names = [
        'START',
        'END',
        'MENTION',
        'NER LABEL',
        'QID',
        'WIKIDATA RESSOURCE  (wikidata disambiguation)',
        'LINKING SCORE',
    ]
    df = pd.DataFrame(entities, columns=column_names)

    streamlit.write("## 🔎 Explore named entities in table: ")
    streamlit.write(df)

    streamlit.write("## 🔎 Explore named entities in text: ")
    # Entity spans expressed as character offsets into `plain`.
    highlighted_spans = [
        {
            "start": start,
            "end": end,
            "label": label,
            "kb_id": qid if linking else "",
            "kb_url": url if linking else "",
        }
        for start, end, _mention, label, qid, url, _score in entities
    ]
    label_colors = {
        "EVENT": "#ec7063",
        "LOCATION": "#45b39d",
        "ORGANISATION": "#f39c12",
        "PERSON": "#3498db",
        "TITLE": "#a569bd ",
        "LOC": "#45b39d",
        "MISC": "#ec7063",
        "ORG": "#f39c12",
        "PER": "#3498db",
    }
    spacy_streamlit.visualize_ner(
        {"text": plain, "ents": highlighted_spans},
        labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", "LOC", "MISC", "ORG", "PER"],
        show_table=False,
        manual=True,
        title="",
        displacy_options={"colors": label_colors},
    )