"""Streamlit application: apply NER (and optional entity linking) to XML EAD.

The script lets a user upload an XML EAD finding aid, shows the XML next to
the plain text extracted from it, runs an installed spaCy pipeline on that
text and displays the recognised entities in a table and inline in the text.
"""
import json
import re

import pandas as pd
import spacy
import spacy_streamlit
import streamlit
from lxml import etree

# Must stay the first Streamlit command executed by the script.
streamlit.set_page_config(layout="wide")

samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}

with open("config.json", mode="r", encoding="utf-8") as json_file:
    CONFIGURATION = json.load(json_file)

# Column headers of the entity table, in the order of the entity tuples.
ENTITY_COLUMNS = [
    'START',
    'END',
    'MENTION',
    'NER LABEL',
    'QID',
    'WIKIDATA RESSOURCE (wikidata disambiguation)',
    'LINKING SCORE',
]

# Labels offered in the inline visualisation and their highlight colours.
NER_LABELS = ["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE",
              'LOC', 'MISC', 'ORG', 'PER']
NER_COLORS = {
    "EVENT": "#ec7063",
    "LOCATION": "#45b39d",
    "ORGANISATION": "#f39c12",
    "PERSON": "#3498db",
    "TITLE": "#a569bd ",
    "LOC": "#45b39d",
    "MISC": "#ec7063",
    "ORG": "#f39c12",
    "PER": "#3498db",
}

_WHITESPACE_RE = re.compile(r"\s+")


def _flatten_text(element):
    """Return the text content of *element* with whitespace runs collapsed."""
    return _WHITESPACE_RE.sub(" ", " ".join(element.itertext())).strip()


def ead_strategy(tree):
    """Extract one plain-text "sentence" per ``<did>`` found under ``<dsc>``.

    For every ``<did>`` the text of the element is concatenated with the text
    of its next sibling element, when there is one.

    NOTE(review): the next sibling is presumably the ``<scopecontent>``, but
    the tag is not checked, so any following sibling contributes its text.

    :param tree: parsed XML element (lxml) to search.
    :return: ``(container_dids, sentences)`` - the ``<did>`` elements and,
        at the same index, their text padded with one leading and one
        trailing space.
    """
    sentences = []
    container_dids = []
    for dsc in tree.xpath('.//dsc'):
        for did in dsc.xpath('.//did'):
            container_dids.append(did)
            parts = [_flatten_text(did)]
            sibling = did.getnext()
            if sibling is not None:
                parts.append(_flatten_text(sibling))
            # Join with an explicit space so the last word of the <did> never
            # fuses with the first word of the sibling in compact XML.
            text = " ".join(part for part in parts if part)
            sentences.append(" " + text + " ")
    return container_dids, sentences


def _collect_entities(nlp, sentences, linking):
    """Run *nlp* on *sentences* and return the entities as flat tuples.

    Character offsets are relative to ``"\\n".join(sentences)``: every
    document shifts the running offset by its length plus one for the
    newline separator.

    :param nlp: loaded spaCy pipeline.
    :param sentences: texts to process, in display order.
    :param linking: when true, read the entity-fishing extension attributes;
        otherwise the three linking fields are empty strings.
    :return: list of ``(start, end, mention, label, qid, url, score)``.
    """
    entities = []
    offset = 0
    # Same batch sizes as before the two loops were merged: 250 with linking,
    # the pipeline default otherwise.
    batch_size = 250 if linking else None
    for doc in nlp.pipe(sentences, batch_size=batch_size):
        for ent in doc.ents:
            start = offset + ent.start_char
            end = start + len(ent.text)
            if linking:
                link_info = (ent._.kb_qid, ent._.url_wikidata,
                             ent._.nerd_score)
            else:
                link_info = ("", "", "")
            entities.append((start, end, ent.text, ent.label_, *link_info))
        offset += len(doc.text) + 1
    return entities


# TITLE APP
streamlit.title("NER4Archives visualizer")
streamlit.sidebar.title("NER4Archives visualizer")
streamlit.sidebar.write("## Motivation")
streamlit.sidebar.markdown("""

This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on XML EAD finding aids and evaluate NER predictions.

In the context of the NER4Archives project (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset extracted from XML EAD finding aids and test it on new data.

Most of the models available here are trained with the NLP spaCy framework and are available on the HF organisation hub. Other models may be added in the future.

The project also includes a downstream entity linking task. The SpaCy fishing extension (based on entity-fishing) is used here to support this purpose.

NER4Archives - 2022
""", unsafe_allow_html=True)

scol1, scol2 = streamlit.sidebar.columns(2)
scol1.image("./assets/an.png", width=170)
scol2.image("./assets/almanach_rouge-inria.png", width=100)

# 1. User provides a XML EAD
streamlit.write("## 📄 Input XML EAD:")
uploaded_file = streamlit.file_uploader("Upload an XML EAD", type="xml")
streamlit.markdown("or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")

# The raw bytes go straight to the parser, which is told the encoding.
data = uploaded_file.getvalue() if uploaded_file is not None else b""

# 2. Parse the upload; a recovering parser may still fail or yield nothing.
tree = None
if data:
    # NOTE(review): the file is user-supplied; consider whether entity
    # resolution should be disabled on this parser.
    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    try:
        tree = etree.fromstring(data, parser=parser)
    except etree.XMLSyntaxError:
        tree = None
    if tree is None:
        streamlit.error("The uploaded file could not be parsed as XML.")

if tree is not None:
    # 3. Show the XML and the extracted plain text side by side.
    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    col1.text_area("", value=xml, height=500, disabled=True)
    _, sentences = ead_strategy(tree)
    plain = "\n".join(sentences)
    col2.text_area("", value=plain, height=500, disabled=True)

    # 4. Model and options.
    streamlit.write("## ⚙️ Configure NER model and options:")
    models = list(spacy.info()["pipelines"])
    option = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ', models)
    linking = streamlit.checkbox(
        'Check to apply named entity linking (entity-fishing component)',
        value=True)
    linkingicon = "✅️" if linking else "❌"
    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')

    if not option:
        # selectbox yields None when no pipeline is installed; loading would
        # crash, so the launch button is not offered.
        streamlit.warning("No spaCy pipeline is installed in this environment.")
    elif streamlit.button('Launch'):
        # 5. Launch NER process.
        with streamlit.spinner('Initialize NER...'):
            nlp = spacy.load(option)
            nlp.max_length = 5000000
            if linking:
                nlp.add_pipe('entityfishing',
                             config={"language": "fr",
                                     "api_ef_base": CONFIGURATION['ef_endpoint']})

        with streamlit.spinner('NER processing...'):
            entities = _collect_entities(nlp, sentences, linking)

        streamlit.success('😃 NER applied with success!')

        df = pd.DataFrame(entities, columns=ENTITY_COLUMNS)

        streamlit.write("## 🔎 Explore named entities in table: ")
        streamlit.write(df)

        streamlit.write("## 🔎 Explore named entities in text: ")
        # The linking fields are already empty strings when linking is off.
        spacy_streamlit.visualize_ner(
            {"text": plain,
             "ents": [{"start": ent[0],
                       "end": ent[1],
                       "label": ent[3],
                       "kb_id": ent[4],
                       "kb_url": ent[5]}
                      for ent in entities]},
            labels=NER_LABELS,
            show_table=False,
            manual=True,
            title="",
            displacy_options={"colors": NER_COLORS})