Commit febb7f1
Parent(s): e5a848e

Upload normalizer.py

normalizer.py ADDED (+205 -0)
@@ -0,0 +1,205 @@
from parsivar import Normalizer

import num2fawords
import re
import string

from dictionary import dictionary_mapping, fixator_dictionary

_normalizer = Normalizer(half_space_char="\u200c", statistical_space_correction=True)

# Punctuation, Arabic diacritics, and stray Latin characters to strip from
# transcripts before training.
chars_to_ignore = [
    ",", "?", ".", "!", "-", ";", ":", '""', "%", "'", '"', "�",
    "#", "!", "؟", "?", "«", "»", "،", "(", ")", "؛", "'ٔ", "٬", 'ٔ', ",", "?",
    ".", "!", "-", ";", ":", '"', "“", "%", "‘", "”", "�", "–", "…", "_", "”", '“', '„',
    'ā', 'š', 'ّ', 'ْ',
]
chars_to_ignore = chars_to_ignore + list(string.ascii_lowercase + string.digits)
# Escape each character so "-" and similar cannot form unintended ranges
# inside the character class.
chars_to_ignore_regex = f"""[{"".join(map(re.escape, chars_to_ignore))}]"""
zwnj = "\u200c"
# Letters that never join to the following letter, so no zero-width
# non-joiner is needed after them.
silent_chars = ["ا", "د", "ذ", "ر", "ز", "و", "آ"] + [zwnj] + [" "]


def multiple_replace(text, chars_to_mapping):
    """Apply every key -> value substitution in `chars_to_mapping` in one pass."""
    pattern = "|".join(map(re.escape, chars_to_mapping.keys()))
    return re.sub(pattern, lambda m: chars_to_mapping[m.group()], str(text))


def remove_special_characters(text, chars_to_ignore_regex):
    """Strip the ignored characters and lowercase the result."""
    return re.sub(chars_to_ignore_regex, "", text).lower() + " "


def convert_word_nums_to_text(word):
    """Spell out a numeric token in Persian words; leave other tokens as-is."""
    try:
        word = num2fawords.words(int(word))
    except ValueError:
        pass

    return word


def normalizer_at_word_level(text):
    """Convert numbers to words and apply the fixator dictionary per token."""
    _text = []

    for word in text.split():
        word = convert_word_nums_to_text(word)
        word = fixator_dictionary.get(word, word)
        _text.append(word)

    return " ".join(_text) + " "


def finder(ss, s, starter=False):
    """Return positions of `ss` in `s`: start indices only, or (start, end) pairs."""
    found = []
    for m in re.finditer(ss, s):
        if starter:
            found.append(m.start())
        else:
            found.append((m.start(), m.end()))

    return found


def substring_replace(ss, s, start, end, stripped=True):
    """Replace s[start:end] with `ss`, optionally dropping a preceding space.

    Returns the new string and 1 if a trailing space was stripped, else 0,
    so callers can keep their running index offset in sync.
    """
    s_start = s[:start]
    s_end = s[end:]

    counter = 0
    if stripped:
        counter = 1 if s_start.endswith(" ") else counter
        s_start = s_start.rstrip()

    return s_start + ss + s_end, counter


def normalizer(
        batch,
        is_normalize=True,
        return_dict=True,
        filter_trivials=False,
        remove_extra_space=False
):
    text = batch["sentence"].lower().strip()

    # Parsivar normalizer
    if is_normalize:
        text = _normalizer.normalize(text)

    # Dictionary mapping
    text = multiple_replace(text, dictionary_mapping)
    text = re.sub(" +", " ", text)

    # Remove special characters
    text = remove_special_characters(text, chars_to_ignore_regex)
    text = re.sub(" +", " ", text)

    # Insert a zero-width non-joiner before آ when the preceding letter
    # would otherwise join to it.
    special, pointer = "آ", 0
    for f in sorted(finder(special, text, True)):
        index = f + pointer - 1  # position of the character before آ
        # Guard against look-behind running off either end of the string.
        if 0 <= index < len(text) and text[index] not in silent_chars:
            text, extra_pointer = substring_replace(
                f"{text[index]}{zwnj}", text, index, index + 1, stripped=True)
            pointer += 1 - extra_pointer

    # Detach connected plural/possessive ها suffixes with a zero-width non-joiner.
    special_list = [
        # "ام", "ای", "است", "ایم", "اید", "اند",
        "هایمان", "هایم", "هایت", "هایش",
        "هایتان", "هایشان", "هام", "هات",
        "هاتان", "هامون", "هامان", "هاش",
        "هاتون", "هاشان", "هاشون",
        "هایی", "های", "هاس", "ها"
    ]
    for special in special_list:
        pointer = 0
        for f in sorted(finder(special, text, False)):
            start, end = f[0] + pointer - 1, f[1] + pointer - 1
            # Only rewrite a suffix at the end of the text or before a space.
            if len(text) >= end + 1 and (len(text) == end + 1 or text[end + 1] == " "):
                text, extra_pointer = substring_replace(
                    f"{zwnj}{special}", text, start + 1, end + 1, stripped=True)
                pointer += 1 - extra_pointer

    # Detach connected افزار (as in نرم‌افزار) with a zero-width non-joiner.
    special, pointer = "افزار", 0
    for f in sorted(finder(special, text, False)):
        start, end = f[0] + pointer - 1, f[1] + pointer - 1

        if len(text) >= end + 1:
            text, extra_pointer = substring_replace(
                f"{zwnj}{special}", text, start + 1, end + 1, stripped=True)
            pointer += 1 - extra_pointer

    # Detach connected comparative/superlative تر and ترین suffixes.
    special_list = [
        "ترین", "تر"
    ]
    for special in special_list:
        pointer = 0
        for f in sorted(finder(special, text, False)):
            start, end = f[0] + pointer - 1, f[1] + pointer - 1
            if len(text) >= end + 1 and (len(text) == end + 1 or text[end + 1] == " "):
                text, extra_pointer = substring_replace(
                    f"{zwnj}{special}", text, start + 1, end + 1, stripped=True)
                pointer += 1 - extra_pointer

    # Normalize at word level
    text = normalizer_at_word_level(text)
    text = re.sub(" +", " ", text)

    if remove_extra_space:
        text = text.strip()
    else:
        text = text.strip() + " "

    if filter_trivials and len(text) <= 2:
        text = None

    if not return_dict:
        return text

    batch["sentence"] = text
    return batch
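# --- Illustrative usage (not part of the original upload) ---
# A minimal sketch of calling `normalizer` on a single dataset row.
# The "sentence" key matches what the function reads; the sample text
# itself is invented for demonstration, and the Persian digits ۲۵ show
# the number-to-words step at word level.
if __name__ == "__main__":
    sample = {"sentence": "ما ۲۵ کتاب داریم"}
    print(normalizer(sample, return_dict=False))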