import re
import json
import streamlit
import spacy_streamlit
import spacy
from lxml import etree
import pandas as pd
# Use the full browser width: the app shows two side-by-side views below.
streamlit.set_page_config(layout="wide")

# Bundled sample finding aid(s) shipped with the app (see samples/ dir).
samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}

# Application configuration (e.g. the entity-fishing endpoint used later
# as CONFIGURATION['ef_endpoint']) lives in config.json next to the app.
with open('config.json', mode="r", encoding="utf-8") as json_file:
    CONFIGURATION = json.load(json_file)
# ---- Application header and sidebar ----
streamlit.title("NER4Archives visualizer")
sidebar = streamlit.sidebar
sidebar.title("NER4Archives visualizer")
sidebar.write("## Motivation")
# Project blurb shown in the sidebar.
sidebar.markdown("""
This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
XML EAD finding aids and evaluate NER predictions.
In the context of the NER4Archives project (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
extracted from XML EAD finding aids and test it on new data.
Most of the models available here are trained with the NLP spaCy
framework and are available on the HF organisation hub.
Other models may be added in the future.
The project also includes a downstream entity linking task. The SpaCy fishing extension (based on entity-fishing) is used here to support this purpose.
NER4Archives - 2022
""", unsafe_allow_html=True)
# Partner logos side by side at the bottom of the sidebar.
logo_left, logo_right = sidebar.columns(2)
logo_left.image("./assets/an.png", width=170)
logo_right.image("./assets/almanach_rouge-inria.png", width=100)
# State flags driving the step-by-step UI below.
flag_file = False
flag_model = False
data = ""

# 1. User provides a XML EAD
streamlit.write("## 📄 Input XML EAD:")
filename = streamlit.file_uploader("Upload an XML EAD", type="xml")
streamlit.markdown("or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
if filename is not None:
    # Round-trip through str: keeps the raw bytes but raises early if the
    # uploaded file is not valid UTF-8.
    data = filename.getvalue().decode("utf-8").encode("utf-8")
    flag_file = len(data) > 0
def ead_strategy(tree):
    """Collect <did> elements and their associated text from an EAD tree.

    Walks every <dsc> description block and, for each <did> found inside,
    builds one normalized plain-text "sentence" from the <did> text plus
    the text of its immediately following sibling (typically
    <scopecontent>, when present).

    :param tree: parsed lxml element (root of an XML EAD document)
    :return: tuple (container_dids, sentences) — the <did> elements and,
             at the same index, the text extracted for each one.
    """
    sentences = []
    container_dids = []
    for dsc in tree.xpath('.//dsc'):
        for did in dsc.xpath('.//did'):
            container_dids.append(did)
            parts = []
            # Text carried by the <did> element itself.
            did_text = " ".join(
                fragment.strip() for fragment in did.itertext() if len(fragment) > 0)
            if did_text:
                parts.append(did_text)
            # The next sibling completes the description; normalize its
            # internal whitespace runs to single spaces.
            sibling = did.getnext()
            if sibling is not None:
                sibling_text = " ".join(
                    " ".join(fragment.strip().split())
                    for fragment in sibling.itertext() if len(fragment) > 0)
                if sibling_text:
                    parts.append(sibling_text)
            # Join with a separator: plain concatenation fused the last
            # word of <did> with the first word of the sibling text.
            text = " ".join(parts)
            sentences.append(" " + re.sub(r"\s{2,}", " ", text.strip()) + " ")
    return container_dids, sentences
# Defaults consumed by the configuration / NER sections further down.
model = ""
linking = True
flag_view = False
if flag_file:
    # Two side-by-side previews: raw XML tree and extracted plain text.
    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    xml_parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
    tree = etree.fromstring(data, parser=xml_parser)
    pretty_xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
    col1.text_area("", value=pretty_xml, height=500, disabled=True)
    # Extract <did> elements and their text for downstream NER.
    dids, sentences = ead_strategy(tree)
    plain = "\n".join(sentences)
    col2.text_area("", value=plain, height=500, disabled=True)
    flag_view = True
if flag_view:
    streamlit.write("## ⚙️ Configure NER model and options:")
    # Every spaCy pipeline installed in the environment is selectable.
    models = list(spacy.info()["pipelines"])
    option = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        models)
    model = option
    if model != "":
        flag_model = True
    linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
    linkingicon = "✅️" if linking else "❌"
    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
# Containers for NER results shared with the visualisation below.
entities = []
docs = []
ents = []
flag_vizualize = False

# Launch NER process:
if flag_model:
    if streamlit.button('Launch'):
        plain = "\n".join(sentences)
        with streamlit.spinner('Initialize NER...'):
            nlp = spacy.load(model)
            nlp.max_length = 5000000
            if linking:
                # entity-fishing populates ent._.kb_qid / ._.url_wikidata
                # / ._.nerd_score on every entity.
                nlp.add_pipe('entityfishing', config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
        with streamlit.spinner('NER processing...'):
            # Offsets are tracked against `plain`: "\n".join() inserts one
            # character between consecutive sentences, hence the +1.
            # Single loop for both modes (previously duplicated; the
            # non-linking copy also lacked batch_size).
            start_sentence = 0
            for doc in nlp.pipe(sentences, batch_size=250):
                end_sentence = start_sentence + len(doc.text) + 1
                for ent in doc.ents:
                    start_tok = start_sentence + ent.start_char
                    end_tok = start_tok + len(ent.text)
                    # Linking metadata only exists when entity-fishing ran.
                    if linking:
                        kb_qid, kb_url, kb_score = ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score
                    else:
                        kb_qid, kb_url, kb_score = "", "", ""
                    entities.append((
                        start_tok,
                        end_tok,
                        ent.text,
                        ent.label_,
                        kb_qid,
                        kb_url,
                        kb_score
                    ))
                start_sentence = end_sentence
        streamlit.success('😃 NER applied with success!')
        df = pd.DataFrame(entities, columns=['START',
                                             'END',
                                             'MENTION',
                                             'NER LABEL',
                                             'QID',
                                             'WIKIDATA RESSOURCE (wikidata disambiguation)',
                                             'LINKING SCORE'
                                             ])
        streamlit.write("## 🔎 Explore named entities in table: ")
        streamlit.write(df)
        streamlit.write("## 🔎 Explore named entities in text: ")
        spacy_streamlit.visualize_ner(
            {"text": plain,
             "ents": [{"start": ent[0],
                       "end": ent[1],
                       "label": ent[3],
                       # tuples built above already hold "" when linking
                       # is off, so no conditional is needed here
                       "kb_id": ent[4],
                       "kb_url": ent[5]
                       } for ent in entities]},
            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
            show_table=False,
            manual=True,
            title="",
            displacy_options={
                "colors": {
                    "EVENT": "#ec7063",
                    "LOCATION": "#45b39d",
                    "ORGANISATION": "#f39c12",
                    "PERSON": "#3498db",
                    # stray trailing space removed from the hex value
                    "TITLE": "#a569bd",
                    "LOC": "#45b39d",
                    "MISC": "#ec7063",
                    "ORG": "#f39c12",
                    "PER": "#3498db"
                }
            })