Spaces:

ner4archives
/

ner4archives-NEL-vizualizer-app

Sleeping

App Files Files Community

lterriel committed on Dec 4, 2023

Commit

ac73442

1 Parent(s): 4c6d441

Update app.py

Browse files

Files changed (1) hide show

app.py +200 -168

app.py CHANGED Viewed

@@ -1,62 +1,40 @@
 import re
 import json
 import streamlit
 import spacy_streamlit
 import spacy
 from lxml import etree
 import pandas as pd
-streamlit.set_page_config(layout="wide")
-samples_test = {"FRAN_IR_050370.xml": "./samples/FRAN_IR_050370.xml"}
-with open('config.json', mode="r") as json_file:
     CONFIGURATION = json.loads(json_file.read())
-# TITLE APP
 streamlit.title("NER4Archives visualizer")
-streamlit.sidebar.title("NER4Archives visualizer")
-streamlit.sidebar.write("## Motivation")
-streamlit.sidebar.markdown("""<div style="text-align: justify;">
-<p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
-XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>
-<p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
-extracted from XML EAD finding aids and test it on new data.<p>
-<p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
-framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
-Other models may be added in the future.</p>
-<p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
-NER4Archives - 2022</div>
-""", unsafe_allow_html=True)
-scol1, scol2 = streamlit.sidebar.columns(2)
-scol1.image("./assets/an.png", width=170)
-scol2.image("./assets/almanach_rouge-inria.png", width=100)
-flag_file = False
-# 1. User provides a XML EAD
-streamlit.write("## 📄 Input XML EAD:")
-filename = streamlit.file_uploader("Upload an XML EAD", type="xml")
-streamlit.markdown("or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
-data = ""
-flag_model = False
-if filename is not None:
-    data = filename.getvalue().decode("utf-8").encode("utf-8")
-    if len(data) > 0:
-        flag_file = True
 def ead_strategy(tree):
-    # create a container for sentences and dids
-    # elements
     sentences = []
     container_dids = []
     # get the <dsc> level
@@ -78,132 +56,186 @@ def ead_strategy(tree):
     # assert len(sentences) == len(container_dids)
     return container_dids, sentences
-model = ""
-linking = True
-flag_view = False
-if flag_file:
-    col1, col2 = streamlit.columns(2)
-    col1.write("## 👁️ XML tree view:")
-    col2.write("## 👁️ Plain text view:")
-    parser = etree.XMLParser(ns_clean=True, recover=True, encoding='utf-8')
     tree = etree.fromstring(data, parser=parser)
     xml = etree.tostring(tree, pretty_print=True, encoding="utf-8").decode("utf-8")
-    col1.text_area("", value=xml, height=500, disabled=True)
     dids, sentences = ead_strategy(tree)
-    plain = "\n".join(sentences)
-    col2.text_area("", value=plain, height=500, disabled=True)
-    flag_view = True
-if flag_view:
-    streamlit.write("## ⚙️ Configure NER model and options:")
-    models = []
-    for pipe in spacy.info()["pipelines"]:
-        models.append(pipe)
-    option = streamlit.selectbox(
-        'Choose a NER model you want to apply in the list: ',
-        models)
-    model = option
-    if model != "":
-        flag_model = True
-    #linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)', value=True)
-    #linkingicon = "✅️"
-    #if linking is False:
-    #    linkingicon = "❌"
-    linking = False
-    streamlit.write("#### Actual Parameters:")
-    #streamlit.write(f'- NER model selected: {option}\n - linking: {linkingicon}')
-    streamlit.write(f'- NER model selected: {option}\n')
-entities = []
-docs = []
-ents = []
-flag_vizualize = False
-# Launch NER process:
-if flag_model:
-    if streamlit.button('Launch'):
         plain = "\n".join(sentences)
-        with streamlit.spinner('Initialize NER...'):
-            nlp = spacy.load(model)
-            nlp.max_length = 5000000
-            if linking:
-                nlp.add_pipe('entityfishing', config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
-        with streamlit.spinner('NER processing...'):
-            if linking:
-                start_sentence = 0
-                for doc in nlp.pipe(sentences, batch_size=250):
-                    end_sentence = start_sentence + len(doc.text) + 1
-                    for ent in doc.ents:
-                        start_tok = start_sentence + ent.start_char
-                        end_tok = start_tok + len(ent.text)
-                        entities.append((
-                            start_tok,
-                            end_tok,
-                            ent.text,
-                            ent.label_,
-                            ent._.kb_qid,
-                            ent._.url_wikidata,
-                            ent._.nerd_score
-                        ))
-                    start_sentence = end_sentence
             else:
-                start_sentence = 0
-                for doc in nlp.pipe(sentences):
-                    end_sentence = start_sentence + len(doc.text) + 1
-                    for ent in doc.ents:
-                        start_tok = start_sentence + ent.start_char
-                        end_tok = start_tok + len(ent.text)
-                        entities.append((start_tok,
-                                          end_tok,
-                                          ent.text,
-                                          ent.label_,
-                                          "",
-                                          "",
-                                          ""
-                                          ))
-                    start_sentence = end_sentence
-        streamlit.success('😃 NER applied with success!')
-        df = pd.DataFrame(entities, columns=['START',
-                                             'END',
-                                             'MENTION',
-                                             'NER LABEL',
-                                             'QID',
-                                             'WIKIDATA RESSOURCE  (wikidata disambiguation)',
-                                             'LINKING SCORE'
-                                             ])
-        streamlit.write("## 🔎 Explore named entities in table: ")
-        streamlit.write(df)
-        streamlit.write("## 🔎 Explore named entities in text: ")
-        spacy_streamlit.visualize_ner(
-            {"text": plain,
-             "ents": [{"start": ent[0],
-                  "end": ent[1],
-                  "label": ent[3],
-                  "kb_id": ent[4] if linking else "",
-                  "kb_url": ent[5] if linking else ""
-                  } for ent in entities]},
-            labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
-            show_table=False,
-            manual=True,
-            title="",
-            displacy_options={
-                "colors": {
-                 "EVENT": "#ec7063",
-                 "LOCATION": "#45b39d",
-                 "ORGANISATION": "#f39c12",
-                 "PERSON": "#3498db",
-                 "TITLE": "#a569bd ",
-                 "LOC": "#45b39d",
-                 "MISC": "#ec7063",
-                 "ORG": "#f39c12",
-                 "PER": "#3498db"
-                }
-            })

 import re
 import json
+import requests
 import streamlit
 import spacy_streamlit
 import spacy
 from lxml import etree
 import pandas as pd
# Constants
CONFIG_FILE = "config.json"  # JSON settings file; read once at start-up
ASSETS_DIR = "./assets"  # directory containing the images shown in the sidebar
# Keyword arguments forwarded verbatim to lxml's ``etree.XMLParser``.
XML_PARSER_CONFIG = {'ns_clean': True, 'recover': True, 'encoding': 'utf-8'}
# Highlight colour used for each entity label in the text visualisation.
ENTITY_COLORS = {
    "EVENT": "#ec7063",
    "LOCATION": "#45b39d",
    "ORGANISATION": "#f39c12",
    "PERSON": "#3498db",
    "TITLE": "#a569bd ",
    "LOC": "#45b39d",
    "MISC": "#ec7063",
    "ORG": "#f39c12",
    "PER": "#3498db"
}

# Read configuration.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(CONFIG_FILE, mode="r", encoding="utf-8") as json_file:
    CONFIGURATION = json.load(json_file)

# Set up Streamlit page (the page configuration is issued before any other
# Streamlit command in this script).
streamlit.set_page_config(layout="wide")
streamlit.title("NER4Archives visualizer")
 def ead_strategy(tree):
     sentences = []
     container_dids = []
     # get the <dsc> level
     # assert len(sentences) == len(container_dids)
     return container_dids, sentences
def process_xml(data):
    """Parse an XML EAD document and extract what the app displays.

    :param data: the XML content handed to ``etree.fromstring``.
    :return: a 3-tuple ``(xml, dids, sentences)`` where ``xml`` is the
        pretty-printed document as a string and ``dids`` / ``sentences``
        are the two values returned by ``ead_strategy`` for the parsed tree.
    """
    root = etree.fromstring(data, parser=etree.XMLParser(**XML_PARSER_CONFIG))
    # Serialise first, then run the extraction strategy on the same tree.
    pretty_bytes = etree.tostring(root, pretty_print=True, encoding="utf-8")
    pretty_xml = pretty_bytes.decode("utf-8")
    container_dids, extracted_sentences = ead_strategy(root)
    return pretty_xml, container_dids, extracted_sentences
def is_entity_fishing_online(timeout=5):
    """Check whether the configured entity-fishing server is reachable.

    The probed URL is the configured ``ef_endpoint`` with its last
    ``/``-separated segment removed.

    :param timeout: seconds to wait for the server. Without a timeout an
        unresponsive host would block the whole app on every rerun.
    :return: ``True`` only when the server answers with HTTP 200; ``False``
        for any other status, a missing/unusable endpoint setting, or any
        request failure (connection error, timeout, invalid URL, ...).
    """
    # Stay best-effort: a broken configuration means "offline", not a crash.
    try:
        endpoint = CONFIGURATION["ef_endpoint"]
    except (KeyError, TypeError):
        return False
    if not isinstance(endpoint, str):
        return False

    base_url = "/".join(endpoint.split("/")[:-1])
    try:
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-level failures are treated as "offline";
        # unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
        return False
    return response.status_code == 200
def setup_sidebar():
    """Render the sidebar: title, "Motivation" text and the two logos.

    Takes no argument and returns nothing; it only issues Streamlit
    sidebar commands.
    """
    streamlit.sidebar.title("NER4Archives visualizer")
    streamlit.sidebar.write("## Motivation")
    # Raw HTML is needed for the justified layout and the target="_blank"
    # links, hence unsafe_allow_html=True. The content is a static literal.
    # The second paragraph is now closed with </p> (it was terminated by a
    # stray opening <p> tag, which produced malformed markup).
    streamlit.sidebar.markdown("""<div style="text-align: justify;">
    <p>This application is a proof-of-concept to apply and evaluate text classification task (also called Named-Entity Recognition) on
    XML <a href="https://www.loc.gov/ead/" target="_blank">EAD</a> <a href="https://fr.wikipedia.org/wiki/Instrument_de_recherche" target="_blank">finding aids</a> and evaluate NER predictions.</p>
    <p>In the context of the <a href="https://github.com/NER4Archives-project" target="_blank">NER4Archives project</a> (INRIA-ALMAnaCH/Archives nationales), the goal is to train NER models on annotated dataset
    extracted from XML EAD finding aids and test it on new data.</p>
    <p>Most of the models available here are trained with the NLP <a href="https://spacy.io/" target="_blank">spaCy</a>
    framework and are available on the <a href="https://huggingface.co/ner4archives" target="_blank">HF organisation hub</a>.
    Other models may be added in the future.</p>
    <p>The project also includes a downstream entity linking task. The <a href="https://github.com/Lucaterre/spacyfishing" target="_blank">SpaCy fishing</a> extension (based on <a href="https://github.com/kermitt2/entity-fishing" target="_blank">entity-fishing</a>) is used here to support this purpose.</p>
    NER4Archives - 2022/2023</div>
    """, unsafe_allow_html=True)
    # Logos are displayed side by side under the text.
    scol1, scol2 = streamlit.sidebar.columns(2)
    scol1.image(f"{ASSETS_DIR}/an.png", width=170)
    scol2.image(f"{ASSETS_DIR}/almanach_rouge-inria.png", width=100)
def _collect_entities(nlp, sentences, linking):
    """Run the pipeline over ``sentences`` and gather the entities found.

    Character offsets are expressed relative to the text obtained by joining
    ``sentences`` with a single newline, which is the text later displayed.

    :param nlp: loaded spaCy pipeline.
    :param sentences: list of strings to process, in display order.
    :param linking: when true, the entity-fishing extension attributes
        (``kb_qid``, ``url_wikidata``, ``nerd_score``) are read from each
        entity; otherwise empty strings are stored in their place.
    :return: list of 7-tuples ``(start, end, mention, label, qid,
        wikidata_url, linking_score)``.
    """
    entities = []
    sentence_offset = 0
    for doc in nlp.pipe(sentences):
        for ent in doc.ents:
            start_tok = sentence_offset + ent.start_char
            end_tok = start_tok + len(ent.text)
            if linking:
                link_info = (ent._.kb_qid, ent._.url_wikidata, ent._.nerd_score)
            else:
                link_info = ("", "", "")
            entities.append((start_tok, end_tok, ent.text, ent.label_) + link_info)
        # +1 accounts for the "\n" separator between joined sentences.
        sentence_offset += len(doc.text) + 1
    return entities


def main():
    """Drive the Streamlit page: upload, preview, configuration, NER, display.

    Each step is only rendered once the previous one has produced what it
    needs (an uploaded non-empty file, a selected model, a click on
    "Launch"); otherwise the function returns early.
    """
    setup_sidebar()

    # 1. User provides a XML EAD
    streamlit.write("## 📄 Input XML EAD:")
    filename = streamlit.file_uploader("Upload an XML EAD (format .xml)", type="xml", label_visibility="collapsed")
    streamlit.markdown(
        "or use an XML EAD provided in [`samples/`](https://huggingface.co/spaces/ner4archives/ner4archives-NEL-vizualizer-app/blob/main/samples/) directory")
    if filename is None:
        return
    # The decode/encode round trip leaves valid UTF-8 untouched and raises
    # UnicodeDecodeError for anything else.
    data = filename.getvalue().decode("utf-8").encode("utf-8")
    if not data:
        return

    # 2. Show the parsed document next to the extracted plain text
    col1, col2 = streamlit.columns(2)
    col1.write("## 👁️ XML tree view:")
    col2.write("## 👁️ Plain text view:")
    xml, _, sentences = process_xml(data)
    col1.text_area("XML Tree View (read-only)", value=xml, height=500, disabled=True)
    plain = "\n".join(sentences)
    col2.text_area("Plain Text View (read-only)", value=plain, height=500, disabled=True)

    # 3. Configure the pipeline
    streamlit.write("## ⚙️ Configure NER pipeline and options:")
    streamlit.write("⚠️ Using Bert based model and/or linking may increase considerably the processing time.")
    models = list(spacy.info()["pipelines"])
    model = streamlit.selectbox(
        'Choose a NER model you want to apply in the list: ',
        models)
    gpu = streamlit.checkbox('Check to use GPU (if available)', value=False)
    if gpu:
        spacy.prefer_gpu()
        gpu_icon = "✅️"
    else:
        spacy.require_cpu()
        gpu_icon = "❌"
    if is_entity_fishing_online():
        streamlit.write("Entity-fishing server status: 🟢 (you can use linking feature)")
        linking = streamlit.checkbox('Check to apply named entity linking (entity-fishing component)',
                                     value=False)
    else:
        streamlit.write("Entity-fishing server status: 🔴 (you can't use linking feature)")
        linking = False
    linkingicon = "✅️" if linking else "❌"
    streamlit.write("#### Actual Parameters:")
    streamlit.write(f'- NER model selected: {model}\n - Linking activated: {linkingicon} - GPU activated: {gpu_icon}')

    # The selectbox yields None when no pipeline is installed; loading would
    # then fail, so stop here instead of offering the Launch button.
    if not model:
        streamlit.warning("No spaCy pipeline is installed: nothing to launch.")
        return

    # 4. Launch NER process:
    if not streamlit.button('Launch'):
        return
    with streamlit.spinner('Initialize NER...'):
        nlp = spacy.load(model)
        nlp.max_length = 5000000
        if linking:
            nlp.add_pipe('entityfishing',
                         config={"language": "fr", "api_ef_base": CONFIGURATION['ef_endpoint']})
    with streamlit.spinner('NER processing...'):
        entities = _collect_entities(nlp, sentences, linking)
    streamlit.success('😃 NER applied with success!')

    # 5. Present the results as a table and as highlighted text
    df = pd.DataFrame(entities, columns=['START',
                                         'END',
                                         'MENTION',
                                         'NER LABEL',
                                         'QID',
                                         'WIKIDATA RESSOURCE  (wikidata disambiguation)',
                                         'LINKING SCORE'
                                         ])
    df[['START', 'END']] = df[['START', 'END']].astype(int)
    streamlit.write("## 🔎 Explore named entities in table: ")
    streamlit.write(df)
    streamlit.write("## 🔎 Explore named entities in text: ")
    spacy_streamlit.visualize_ner(
        {"text": plain,
         "ents": [{"start": ent[0],
                   "end": ent[1],
                   "label": ent[3],
                   # Already empty strings when linking is disabled.
                   "kb_id": ent[4],
                   "kb_url": ent[5]
                   } for ent in entities]},
        labels=["EVENT", "LOCATION", "ORGANISATION", "PERSON", "TITLE", 'LOC', 'MISC', 'ORG', 'PER'],
        show_table=False,
        manual=True,
        title="",
        displacy_options={
            "colors": ENTITY_COLORS
        })


if __name__ == "__main__":
    main()