Spaces:

AMR-KELEG
/

ALDi

Running

App Files Files Community

AMR-KELEG committed on Apr 18

Commit

c0a63e1

1 Parent(s): 3563942

Show preprocessing options

Browse files

Files changed (1) hide show

app.py +25 -12

app.py CHANGED Viewed

@@ -12,24 +12,29 @@ import base64
 import re
-def preprocess_text(arabic_text):
     """Apply preprocessing to the given Arabic text.
     Args:
         arabic_text: The Arabic text to be preprocessed.
     Returns:
         The preprocessed Arabic text.
     """
-    no_urls = re.sub(
-        r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b",
-        "",
-        arabic_text,
-        flags=re.MULTILINE,
-    )
-    no_english = re.sub(r"[a-zA-Z]", "", no_urls)
-    return no_english
 @st.cache_data
@@ -57,7 +62,7 @@ tokenizer = AutoTokenizer.from_pretrained(constants.MODEL_NAME)
 model = load_model(constants.MODEL_NAME)
-def compute_ALDi(sentences):
     """Computes the ALDi score for the given sentences.
     Args:
@@ -72,7 +77,9 @@ def compute_ALDi(sentences):
     BATCH_SIZE = 4
     output_logits = []
-    preprocessed_sentences = [preprocess_text(s) for s in sentences]
     for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
         inputs = tokenizer(
@@ -101,6 +108,7 @@ def render_metadata():
     c = st.container()
     c.write(html, unsafe_allow_html=True)
 render_svg(open("assets/ALDi_logo.svg").read())
 render_metadata()
@@ -114,8 +122,13 @@ with tab1:
     # TODO: Check if this is needed!
     clicked = st.button("Submit")
     if sent:
-        ALDi_score = compute_ALDi([sent])[0]
         ORANGE_COLOR = "#FF8000"
         fig, ax = plt.subplots(figsize=(8, 1))

 import re
# URL matcher: optional http/https scheme, "://", then a run of URL characters.
# Compared with the earlier pattern, the character class also accepts
# "-", "#", ":", "~", "+" and "@", so hyphenated hosts/slugs, ports and
# fragments are removed in full instead of leaving a tail behind.
# The trailing \b makes the match end right after the last word character,
# so sentence punctuation that follows a URL is left in the text.
_URL_PATTERN = re.compile(r"(?:https?)?://[\w./?=&%\-#:~+@]*\b")

# Basic (unaccented) Latin letters only.
_LATIN_PATTERN = re.compile(r"[a-zA-Z]")


def preprocess_text(
    arabic_text: str, remove_urls: bool = True, remove_latin: bool = True
) -> str:
    """Apply preprocessing to the given Arabic text.

    URL removal runs before Latin-letter removal; otherwise the letters of a
    URL would be stripped first and the URL pattern would no longer match it.

    Args:
        arabic_text: The Arabic text to be preprocessed.
        remove_urls: Boolean indicating whether to remove URLs.
            Defaults to True, so a one-argument call applies both steps.
        remove_latin: Boolean indicating whether to remove Latin characters
            (a-z, A-Z). Defaults to True.

    Returns:
        The preprocessed Arabic text. Whitespace around removed spans is
        kept as-is (no trimming or collapsing is performed).
    """
    if remove_urls:
        arabic_text = _URL_PATTERN.sub("", arabic_text)
    if remove_latin:
        arabic_text = _LATIN_PATTERN.sub("", arabic_text)
    return arabic_text
 @st.cache_data
 model = load_model(constants.MODEL_NAME)
+def compute_ALDi(sentences, remove_urls=True, remove_latin=True):
     """Computes the ALDi score for the given sentences.
     Args:
     BATCH_SIZE = 4
     output_logits = []
+    preprocessed_sentences = [
+        preprocess_text(s, remove_urls, remove_latin) for s in sentences
+    ]
     for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
         inputs = tokenizer(
     c = st.container()
     c.write(html, unsafe_allow_html=True)
 render_svg(open("assets/ALDi_logo.svg").read())
 render_metadata()
     # TODO: Check if this is needed!
     clicked = st.button("Submit")
+    remove_urls = st.toggle("Remove urls", value=True)
+    remove_latin = st.toggle("Remove Latin characters", value=True)
     if sent:
+        ALDi_score = compute_ALDi(
+            [sent], remove_urls=remove_urls, remove_latin=remove_latin
+        )[0]
         ORANGE_COLOR = "#FF8000"
         fig, ax = plt.subplots(figsize=(8, 1))