# CUPE-2i / mapper.py
# Tabahi's picture
# Upload folder using huggingface_hub
# d4f5bc1 verified
# Prepared phoneme -> integer id table (per the original note, generated by the
# create_new_index() function). Ids run contiguously from 0 ('SIL') to 66 ('noise').
# Long vowels are spelled with an ASCII ':' here, which is the spelling that
# phoneme_mapping uses for its targets (e.g. 'iː' -> 'i:').
phoneme_mapped_index = {
# Special token
'SIL': 0,
# High front vowels and commonly confused similar vowels
'i': 1, # High front unrounded
'i:': 2, # Long high front unrounded
'ɨ': 3, # High central (grouped here due to high confusion with 'i')
'ɪ': 4, # Near-high front unrounded
# Mid front vowels
'e': 5, # Mid front unrounded
'e:': 6, # Long mid front unrounded
'ɛ': 7, # Open-mid front unrounded
# Central vowels
'ə': 8, # Schwa (mid central)
'ɚ': 9, # R-colored schwa
'ʌ': 10, # Open-mid back unrounded
# Back vowels
'u': 11, # High back rounded
'u:': 12, # Long high back rounded
'ʊ': 13, # Near-high back rounded
'ɯ': 14, # High back unrounded
'o': 15, # Mid back rounded
'o:': 16, # Long mid back rounded
'ɔ': 17, # Open-mid back rounded
# Low vowels
'a': 18, # Open central/front unrounded
'a:': 19, # Long open central/front unrounded
'æ': 20, # Near-open front unrounded
# Front rounded vowels
'y': 21, # High front rounded
'ø': 22, # Mid front rounded
# Diphthongs
'aɪ': 23, # Open central to high front
'eɪ': 24, # Mid front to high front
'aʊ': 25, # Open central to high back
'oʊ': 26, # Mid back to high back
'ɔɪ': 27, # Open-mid back to high front
# Stops (organized by place of articulation)
'p': 28, # Voiceless bilabial
'b': 29, # Voiced bilabial
't': 30, # Voiceless alveolar
'd': 31, # Voiced alveolar
'k': 32, # Voiceless velar
'g': 33, # Voiced velar
'q': 34, # Voiceless uvular
# Affricates and related sibilant fricatives (grouped by similarity)
'ts': 35, # Voiceless alveolar affricate
's': 36, # Voiceless alveolar fricative
'z': 37, # Voiced alveolar fricative
'tʃ': 38, # Voiceless postalveolar affricate
'dʒ': 39, # Voiced postalveolar affricate
'ʃ': 40, # Voiceless postalveolar fricative
'ʒ': 41, # Voiced postalveolar fricative
'ɕ': 42, # Voiceless alveolo-palatal fricative
# Other fricatives (organized by place)
'f': 43, # Voiceless labiodental
'v': 44, # Voiced labiodental
'θ': 45, # Voiceless dental
'ð': 46, # Voiced dental
'ç': 47, # Voiceless palatal
'x': 48, # Voiceless velar
'ɣ': 49, # Voiced velar
'h': 50, # Voiceless glottal
'ʁ': 51, # Voiced uvular
# Nasals (organized by place)
'm': 52, # Bilabial
'n': 53, # Alveolar
'ɲ': 54, # Palatal
'ŋ': 55, # Velar
# Liquids and approximants
'l': 56, # Alveolar lateral
'ɭ': 57, # Retroflex lateral
'ɾ': 58, # Alveolar tap
'ɹ': 59, # Alveolar approximant
'j': 60, # Palatal approximant
'w': 61, # Labial-velar approximant
# Palatalized consonants
'tʲ': 62, # Palatalized t
'nʲ': 63, # Palatalized n
'rʲ': 64, # Palatalized r
'ɭʲ': 65, # Palatalized retroflex lateral
# Special token
'noise': 66
}
# Phoneme id -> phoneme group id. The keys cover the ids 0-66 assigned in
# phoneme_mapped_index; the values are the group ids 0-16 listed in
# phoneme_groups_index (e.g. 28 'p' -> 6 'voiceless_stops').
# NOTE(review): id 3 ('ɨ') is assigned group 3 ('back_vowels') while 'i', 'i:'
# and 'ɪ' (ids 1, 2, 4) are in group 1 ('front_vowels') — confirm this is intended.
phoneme_groups_mapper = {0: 0, 1: 1, 2: 1, 3: 3, 4: 1, 5: 1, 6: 1, 7: 1, 8: 2, 9: 2, 10: 2, 11: 3, 12: 3, 13: 3, 14: 3, 15: 3, 16: 3, 17: 3, 18: 4, 19: 4, 20: 4, 21: 1, 22: 1, 23: 5, 24: 5, 25: 5, 26: 5, 27: 5, 28: 6, 29: 7, 30: 6, 31: 7, 32: 6, 33: 7, 34: 6, 35: 10, 36: 8, 37: 9, 38: 10, 39: 11, 40: 8, 41: 9, 42: 8, 43: 8, 44: 9, 45: 8, 46: 9, 47: 8, 48: 8, 49: 9, 50: 8, 51: 9, 52: 12, 53: 12, 54: 12, 55: 12, 56: 13, 57: 13, 58: 14, 59: 14, 60: 15, 61: 15, 62: 6, 63: 12, 64: 14, 65: 13, 66: 16}
# Phoneme group name -> integer group id (0-16). These ids are the values that
# appear in phoneme_groups_mapper.
phoneme_groups_index = {'SIL': 0, 'front_vowels': 1, 'central_vowels': 2, 'back_vowels': 3, 'low_vowels': 4, 'diphthongs': 5, 'voiceless_stops': 6, 'voiced_stops': 7, 'voiceless_fricatives': 8, 'voiced_fricatives': 9, 'voiceless_affricates': 10, 'voiced_affricates': 11, 'nasals': 12, 'laterals': 13, 'rhotics': 14, 'glides': 15, 'noise': 16}
# Raw phoneme-token counts from the train_100h*8_langs set (OpenSLR-MLS data, 100 hours per language)
complete_vocab = {'SIL': 4914031, 'a': 1604572, 'n': 1501496, 't': 1345451, 's': 1242856, 'i': 1207390, 'e': 985568, 'o': 850470, 'm': 840466, 'l': 840200, 'r': 825931, 'd': 821689, 'k': 814493, 'ɛ': 700232, 'p': 607786, 'ə': 492948, 'v': 432914, 'j': 430514, 'u': 422499, 'ɾ': 419723, 'b': 413539, 'ɑ': 399576, 'ɔ': 344276, 'ʌ': 334025, 'ɪ': 294767, 'f': 292469, 'z': 286215, 'ɡ': 267606, 'ʃ': 249935, 'ɐ': 247989, 'w': 222719, 'ʊ': 200737, 'h': 189391, 'ʁ': 161395, 'ð': 159285, 'ɨ': 157902, 'x': 151422, 'eː': 141335, 'y': 138328, 'iː': 135167, 'ŋ': 118468, 'aɪ': 104170, 'ts': 96727, 'ɹ': 96111, 'æ': 83801, 'tʃ': 83484, 'θ': 81846, 'ʒ': 81251, 'uː': 79788, 'aː': 69487, 'ɕ': 68197, 'β': 67991, 'oː': 67190, 'ɑː': 66515, 'ɣ': 64902, 'eɪ': 63049, 'tʲ': 60946, 'ø': 58520, 'ɭ': 58455, 'nʲ': 56439, 'dʒ': 54175, 'ɑ̃': 53516, 'aʊ': 53297, 'q': 49319, 'ɲ': 48479, 'rʲ': 46853, 'ɭʲ': 45521, 'ɔ̃': 44985, 'ɯ': 43339, 'sʲ': 38535, 'ɲʲ': 38014, 'ɒ': 37682, 'vʲ': 37231, 'ʎ': 37145, 'ç': 35610, 'ʋ': 32705, 'ɚ': 32019, 'tɕ': 32006, 'mʲ': 31189, 'dʲ': 29989, 'ɜ': 27714, 'ja': 27523, 'ʔ': 26931, 'oʊ': 26398, 'ɑɨ': 24604, 'tʃʲ': 24301, '1': 23766, 'dʑ': 22131, 'ɛɪ': 21834, 'tː': 21127, 'ᵻ': 20631, 'ɛ̃': 20508, 'uɨ': 20396, 'ɫ': 19763, 'ɬ': 19662, 'ʑ': 18169, 'œ': 17838, 'oɪ': 16897, 'ɔɨ': 16178, 'ɔː': 15555, 'ɐ̃': 15510, 'ɨː': 15065, 'ɜː': 14661, 'ju': 14500, 'pʲ': 14429, 'aɨ': 13971, 'əl': 13858, 'ɵ': 13512, 'kʲ': 13204, 'ss': 12685, 'ɐ̃ʊ̃': 12237, 't[': 12211, 'əɪ': 12209, 'ɑːɹ': 10053, 'bʲ': 9958, 'd[': 9275, 'yː': 9033, 'eʊ': 8931, 'ɨu': 7653, 'ɡʲ': 7633, 'ɔːɹ': 7072, '(en)': 6245, 'œy': 6018, 'kː': 5801, 'əɨ': 5629, 'ɔø': 5613, 'oːɹ': 5458, 'u"': 5356, 'fʲ': 5315, 'pː': 5307, 'ɛɹ': 5299, 'ɪː': 5068, '??': 5027, 'ɛː': 4894, 'øː': 4796, 'ɔɪ': 4603, 'dZ': 4345, 'ɪu': 4320, 'c': 4200, 'S': 4197, 'ʕ': 4050, '(fr)': 3885, 'ʌʊ': 3787, 'tS': 3731, 'oe': 3730, 'iə': 3654, 'dʒː': 3441, 'ɪɹ': 3101, 'r̝̊': 3099, 'bː': 3051, 'ɟ': 2761, 'uɪ': 2632, 'ʊɹ': 2589, 'tʃː': 2564, 'ħ': 2318, 'ũ': 2158, 
's^': 2080, 't̪': 1778, 'r̝': 1702, 'ɪ^': 1651, 'tsː': 1558, 'dzː': 1479, 'r̩': 1272, 'u:': 1189, 'aɪɚ': 1180, '(de)': 1166, 's̪': 1070, 'dz': 1027, 'iʊ': 940, 'aɪə': 928, 'dˤ': 920, 'χ': 845, 'æi': 819, 'œ̃': 766, '(it)': 719, 'ɑ:': 715, 'o:': 623, 'n̩': 587, 'l̩': 536, 'æː': 534, 'dː': 532, 'õ': 491, 'N': 490, 'y:': 415, 'pf': 342, 'əʊ': 342, 'ʝ': 323, 't^': 288, 'oe:': 257, '(nl)': 237, 'ɛʊ': 237, '(ptpt)': 196, 'e:': 183, 'eə': 144, 'd^': 131, 'i.ː': 129, 'yʊ': 101, 't^ː': 79, 'nl': 76, '(fa)': 65, 'æiː': 64, 'yi': 63, '(es)': 61, 'dʒʲ': 47, 'qː': 43, '(ru)': 40, 'ɡː': 37, 'ɪuː': 37, 'ʊə': 34, 'X': 31, 'a.ː': 31, 'u.ː': 31, 'rr': 25, 'mb': 25, 'ɵː': 24, 'd̪w': 23, 'ʂ': 22, '(tt)': 22, 'dm': 20, 'daː': 17, 'əː': 17, 'it': 17, 'ɡd': 17, 'mj': 16, 'db': 16, 'wb': 16, 'iːː': 15, 'mt': 15, 'ɑk': 15, 'i:': 15, 'da': 14, 'nb': 14, 'eð': 13, 'mx': 13, 'maː': 13, 'tk': 13, 'niː': 13, 'rb': 13, 'mh': 13, 'dˤdˤ': 13, 'fm': 13, 'nm': 11, 'eːh': 11, 'mtʃ': 11, 'ma': 11, 'ʊːt': 11, 'aːn': 11, 'iːe': 11, 'im': 10, 'eːb': 10, 'np': 10, 'aɪaɪ': 9, 'ʃj': 9, 'eːt': 9, 'jː': 8, 'mv': 8, 'ae': 8, 'ed': 8, 'nn': 8, 'dtʃ': 8, 'ɑh': 8, 'tr': 7, 'sb': 7, 'tn': 7, 'lx': 7, 'eːa': 7, 'il': 7, 'ɑj': 7, 'ɑaː': 7, 'oːs': 7, 'eːq': 7, 'ah': 7, 'bb': 7, 'or': 7, 'ɑm': 7, 'kt': 7, 'as': 7, 'eβ': 6, 'eːd': 6, 'ob': 6, 'eːuː': 6, 'rtʃ': 6, 'ʃd': 6, 'md': 6, 'duː': 6, 'eːaː': 6, 'ɡz': 6, 'hx': 6, 'lk': 6, 'eːf': 6, 'ɑq': 6, 'nk': 6, 'nx': 6, 'nj': 6, 'dɔ': 6, 'is': 6, 'aɪp': 6, 'əm': 6, 'laː': 5, 'tb': 5, 'aa': 5, 'zm': 5, 'ntʃ': 5, 'ep': 5, 'bh': 5, 'lh': 5, 'do': 5, 'eːp': 5, 'miː': 5, 'tuː': 5, 'lm': 5, 'mr': 5, 'sz': 5, 'nv': 4, 'in': 4, 'ɑuː': 4, 'iaː': 4, 'oːb': 4, 'uːm': 4, 'naː': 4, 'eːs': 4, 'itʃ': 4, 'eːv': 4, 'az': 4, 'ha': 4, 'dɡ': 4, 'hb': 4, 'mf': 4, 'ɑn': 4, 'ia': 4, 'lb': 4, 'na': 4, 'ld': 4, 'd̪': 4, 'ndʒ': 3, 'uːk': 3, 'uːb': 3, 'mo': 3, 'ɡh': 3, 'mk': 3, 'dk': 3, 'eːtʃ': 3, 'dp': 3, 'tiː': 3, 'bv': 3, 'ta': 3, 'th': 3, 'ip': 3, 'eːj': 3, 'eːx': 3, 'mz': 3, 'eːz': 3, '(pl)': 2, 
'diː': 2, 'eh': 2, 'biː': 2, 'mʃ': 2, 'mn': 2, 'dʒv': 2, 'hʃ': 2, 'liː': 2, 'mɡ': 2, 'jiː': 2, 'ms': 2, 'ʃz': 2, 'ne': 2, 'on': 2, 'raː': 2, 'nh': 2, 'eːʃ': 2, 'rm': 2, 'ʊː': 2, 'laɪ': 1, 'ɑa': 1, 'td': 1, 'eːe': 1, 'mp': 1, 'ɑt': 1, 'nr': 1, 'me': 1, 'ɑo': 1, 'ik': 1, 'iːv': 1, 'dh': 1, 'eːɑ': 1, 'dl': 1, 'ʃdʒ': 1, 'ns': 1}
# Flat list of base phoneme symbols, arranged by manner class.
# The list contains duplicates: 'ʁ', 'ts', 'tʃ' and 'ʌ' from the leading "New"
# row appear again in the Fricatives / Affricates / Vowels rows below.
# Vowel length is written with the IPA mark 'ː' here, not the ASCII ':' used
# by the keys of phoneme_mapped_index.
# NOTE(review): no use of this list is visible in this part of the file.
base_phonemes = [
# New:
'x', 'ç', 'ɣ', 'ʁ', 'ts', 'tʃ', 'ʌ', 'ɭ',
# Stops
'p', 'b', 't', 'd', 'k', 'ɡ', 'ʔ',
# Fricatives
'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 'h', 'ʂ', 'ʐ', 'ɕ', 'ʑ', 'ʁ',
# Affricates
'ts', 'dz', 'tɕ', 'dʑ', 'tʃ', 'dʒ', 'pf',
# Nasals
'm', 'n', 'ŋ',
# Liquids/Glides
'l', 'ɫ', 'ɹ', 'j', 'w', 'ɾ',
# Palatalized
'nʲ', 'lʲ', 'rʲ',
# Vowels
'i', 'iː', 'ɪ', 'e', 'eː', 'ɛ', 'æ',
'a', 'aː', 'ɑː',
'o', 'ɔː', 'ʊ',
'u', 'uː',
'ə', 'ɚ', 'ʌ', 'ɨ', 'ɤ', 'ɯ',
'œ', 'ø', 'øː', 'ʏ', 'yː',
# Diphthongs
'eɪ', 'aɪ', 'ɔɪ', 'oʊ', 'aʊ',
# Other
'ɒ', 'ɜː', 'ɲ',
'noise', 'SIL'
]
phoneme_mapping = {
# Core vowels - simplified based on confusion patterns
'ə': 'ə',
#'ʌ': 'ə', # Merge due to high confusion
'ʌ': 'ʌ', # didn't work well before but still keep it
'ɪ': 'ɪ', 'i': 'i',
'iː': 'i:',
'ʊ': 'ʊ',
'u': 'u',
'uː': 'u:',
'ɛ': 'ɛ', 'e': 'e', 'eː': 'e:',
'ɔː': 'ɔ', 'ɔ': 'ɔ',
#'ɒ': 'ɒ', # Merge to 'a' due to 100% wrong predictions in confusion matrix (23 Jan)
'ɒ': 'a',
'æ': 'æ', # DO NOT merge
'ɑː': 'a:',
'ɑ': 'a',
'a': 'a',
'ɜː': 'ʌ',
'ɜ': 'ʌ',
'ɚ': 'ɚ',
'o': 'o',
'ɨ': 'ɨ',
# Common diphthongs - keep distinct ones
'eɪ': 'eɪ', 'aɪ': 'aɪ', 'ɔɪ': 'ɔɪ',
'aʊ': 'aʊ', 'oʊ': 'oʊ',
# Less common diphthongs - map to similar common ones
'ʌʊ': 'aʊ', 'eʊ': 'aʊ', 'ɛʊ': 'aʊ', 'əʊ': 'oʊ',
'ɛɪ': 'eɪ', 'ʊɪ': 'aɪ', 'ea': 'eɪ',
'aʊ̯': 'aʊ', 'aɪ̯': 'aɪ', 'ɔʏ̯': 'ɔɪ',
# Core consonants
'p': 'p', 'b': 'b', 't': 't', 'd': 'd',
'k': 'k', 'g': 'g', 'm': 'm', 'n': 'n',
'ŋ': 'ŋ', 'f': 'f', 'v': 'v', 'θ': 'θ',
'ð': 'ð', 's': 's', 'z': 'z', 'ʃ': 'ʃ',
'ʒ': 'ʒ', 'h': 'h', 'l': 'l', 'ɹ': 'ɹ',
'j': 'j', 'w': 'w', 'ɲ': 'ɲ', 'ɾ': 'ɾ',
# Consonant mergers based on confusion
# 'ɣ': 'g', # Merge with closest stop
'ɣ': 'ɣ', # emprically confused but will keep it
#'ʁ': 'ɹ', # Map to rhotic
'ʁ': 'ʁ',
'r': 'ɹ', # Map to rhotic
#'x': 'h', # Map to closest fricative
'x': 'x',
#'ç': 'ʃ', # Map to closest fricative
#'ç': 's', # Based on empirical confusion
'ç': 'ç',
'ʂ': 'ʃ', # Map to closest fricative
'ʐ': 'ʒ', # Map to closest fricative
#'ɕ': 'ʃ', # Map to closest fricative
'ɕ': 'ɕ', # keep it
'ʑ': 'ʒ', # Map to closest fricative
# Affricates: keep 'ts', 'tʃ' and 'dʒ' distinct and fold the rarer ones into them ('pf' is reduced to 'f')
#'ts': 't',
'ts': 'ts',
'dz': 'dʒ',
#'tʃ': 'ʃ',
'tʃ': 'tʃ',
#'dʒ': 'ʒ',
'dʒ': 'dʒ',
'tɕ': 'tʃ',
'dʑ': 'dʒ',
'pf': 'f',
#'tʲ': 't',
'tʲ': 'tʲ', # high freuqncy, keep it
#'nʲ': 'n',
'nʲ': 'nʲ', # high freuqncy, keep it
#'rʲ': 'ɹ',
'rʲ': 'rʲ', # high freuqncy, keep it
# Remove palatalization
'lʲ': 'l',
'dʲ': 'd', 'sʲ': 's', 'vʲ': 'v',
'fʲ': 'f', 'mʲ': 'm',
'pʲ': 'p', 'kʲ': 'k', 'bʲ': 'b',
'ɲʲ': 'ɲ', 'dʒʲ': 'dʒ',
# Simplify geminate consonants
'tː': 't', 'dː': 'd', 'kː': 'k',
'gː': 'g', 'pː': 'p', 'bː': 'b',
'fː': 'f', 'vː': 'v', 'sː': 's',
'zː': 'z', 'ʃː': 'ʃ', 'ʒː': 'ʒ',
'mː': 'm', 'nː': 'n', 'ŋː': 'ŋ',
'lː': 'l', 'rː': 'ɹ', 'jː': 'j',
# Nasal vowels to oral counterparts
'ɑ̃': 'a', 'ɛ̃': 'ɛ', 'ɔ̃': 'ɔ',
'ũ': 'u', 'õ': 'oʊ', 'ɐ̃': 'ʌ',
# R-colored vowels
'ɑːɹ': 'ɚ', 'ɔːɹ': 'ɚ',
'ʊɹ': 'ɚ', 'ɪɹ': 'ɚ', 'ɛɹ': 'ɚ',
'oːɹ': 'ɚ',
# Vowel sequences
'ia': 'i:', 'ua': 'u:',
'ɔø': 'ɔ', 'iːɛ': 'i:',
'ʊə': 'ʊ', 'iə': 'i:',
'eə': 'ɛ',
# Common sequences
# 'əl': 'əl', # Keep this distinct sequence
#'əl': 'o', # based on empirical confusion. theoretically, this should be merged with 'l' or 'e' but it's most confused with 'o'
'əl': 'l',
'n̩': 'n',
'ʃf': 'ʃ',
'eð': 'ð',
'ns': 'n',
'nd': 'n',
'ʃts': 'ts',
# Special symbols
'SIL': 'SIL',
'noise': 'noise', # noise will be ignored by the model. CTC will take it as blank token.
'': 'SIL',
'ʔ': 'noise',
# Language markers to silence
'(en)': 'SIL', '(es)': 'SIL', '(fr)': 'SIL',
'(de)': 'SIL', '(it)': 'SIL', '(nl)': 'SIL',
'(pl)': 'SIL', '(ru)': 'SIL', '(ptpt)': 'SIL',
# Error cases to noise
'??': 'noise', 'uk': 'noise', 'it': 'noise',
'ɡd': 'noise', 'rd': 'noise', 'as': 'noise',
'up': 'noise', 'os': 'noise', 'kf': 'noise',
'1': 'noise', 'ʃd': 'noise', 'ʃz': 'noise',
'ʃn': 'noise',
# Vowels
'y': 'y', # Map to existing long form
'yː': 'y', # Keep distinct high front rounded vowel
'œ': 'ø', # Map to closest unrounded vowel
'ø': 'ø', # Map to long version
'øː': 'ø', # Keep distinct mid front rounded vowel
'ɐ': 'ʌ', # Map to schwa
'aː': 'a:', # Keep long a
#'oː': 'ɔ', # Map to similar long vowel
'oː': 'o:', # Keep distinct long o
'ɛː': 'ɛ', # Map to base form
'ɪː': 'i:', # Map to similar long vowel
'ɵ': 'ʊ', # Map to closest vowel
'ᵻ': 'ɪ', # Map to similar vowel
# Double vowels (collapsed to the corresponding single short vowel)
'aa': 'a',
'ɐɐ': 'a',
'ææ': 'æ',
# Diphthongs
'yʊ': 'u', # Map to similar monophthong
'œy': 'ɔɪ', # Map to similar diphthong
'uɪ': 'aɪ', # Map to existing diphthong
'oɪ': 'ɔɪ', # Map to similar diphthong
'iʊ': 'u', # Map to similar monophthong
'aɪə': 'aɪ', # Map to base diphthong
'aɪɚ': 'aɪ', # Map to base diphthong
# Nasal vowels
'ɐ̃ʊ̃': 'aʊ', # Map to oral diphthong
'œ̃': 'ɛ', # Map to oral vowel
# Consonants
'ʝ': 'j', # Map to similar approximant
'ɟ': 'ʒ', # Map to similar affricate
'ʋ': 'v', # Map to similar fricative
'd̪': 'd', # Map dental to alveolar
't̪': 't', # Map dental to alveolar
'ɬ': 'l', # Map to plain lateral
'ʎ': 'l', # Map to plain lateral
'β': 'v', # Map to similar fricative
'ɡ': 'g', # Standardize to 'g'
# Geminate consonants
'ɡː': 'g', # Map to single consonant
'tsː': 'ts', # Map to single affricate
'dzː': 'd', # Map to single affricate
#'tʃː': 'ʃ', # Map to single affricate
'tʃː': 'tʃ', # Map to single affricate
'dʒː': 'dʒ', # Map to single affricate
'ss': 's', # Map to single consonant
# Palatalized consonants
'ɡʲ': 'g', # Map to plain consonant
# Sequences
'dɔ': 'noise', # Map unusual sequence to noise
# These are found (with counts) in Google MSWC data, but not in the OpenSLR-MLS data
# Complex sequences with frequency counts
'ja': 'j', # Common sequence (36,809) -> simplify to first component
'ju': 'j', # Common sequence (19,620) -> simplify to first component
'tʃʲ': 'tʃ', # Common palatalized affricate (32,707) -> map to fricative
#'ɭ': 'l', # Very common retroflex lateral (78,504) -> map to alveolar
'ɭ': 'ɭ',
'ɭʲ': 'ɭʲ', # Common palatalized retroflex (61,298) -> map to plain lateral
'u"': 'u', # Quote variant (7,265) -> normalize to standard long u
'ɪ^': 'ɪ', # Rare diacritic variant (2,222) -> remove diacritic
'sz': 's', # Rare sequence (5) -> simplify to first component
#'q': 'k', # Common uvular stop (75,838) -> map to velar
'q': 'q', # keep it EVEN though it's relively rare (45k)
#'qː': 'k', # Rare long uvular (103) -> map to velar
'qː': 'q',
'r̝̊': 'ɹ', # Rare trilled/fricative r (3,099) -> map to approximant
'r̝': 'ɹ', # Rare variant (1,702) -> map to approximant
'r̩': 'ɹ', # Rare syllabic (1,272) -> map to approximant
'l̩': 'l', # Rare syllabic (536) -> map to standard lateral
'c': 'k', # Uncommon palatal stop (4,195) -> map to velar
# Vowel sequences
'uɨ': 'ɨ', # Common sequence (20,396) -> map to monophthong
'aɨ': 'aɪ', # Common sequence (13,971) -> map to similar diphthong
'ɨu': 'u:', # Less common (7,653) -> map to monophthong
'ɪu': 'u:', # Uncommon (4,320) -> map to monophthong
'ɨː': 'ɨ', # Common variant (15,065) -> remove length marker
'ɑɨ': 'aɪ', # Common sequence (24,604) -> map to diphthong
'əɪ': 'eɪ', # Common sequence (12,209) -> map to similar diphthong
'əɨ': 'ɨ', # Less common (5,629) -> simplify to first component
'ɔɨ': 'ɔɪ', # Common sequence (16,178) -> map to similar diphthong
'ɪuː': 'u:', # Rare sequence (37) -> map to monophthong
# Rare sequences: 1-5 occurrences ---------------------------------------
# Some of the extremely rare consonant-consonant and vowel-consonant sequences map to 'noise' (i.e., ignored), most don't.
# More nasal sequences
'nm': 'n', # was 'noise', map to alveolar nasal
'nn': 'n', # was 'noise', map to single nasal
'mn': 'm', # was 'noise', map to bilabial nasal
'mm': 'm', # was 'noise', map to single nasal
'na': 'n', # was 'noise', preserve nasal
'maː': 'm', # was 'noise', preserve nasal
'mz': 'm', # was 'noise', preserve nasal
'ms': 'm', # was 'noise', preserve nasal
'mf': 'm', # was 'noise', preserve nasal
'mɡ': 'm', # was 'noise', preserve nasal
'mx': 'm', # was 'noise', preserve nasal
'mv': 'm', # was 'noise', preserve nasal
'mʃ': 'm', # current mapping is good
# Stop sequences
'dk': 'd', # was 'noise', preserve first stop
'dp': 'd', # was 'noise', preserve first stop
'db': 'd', # was 'noise', preserve first stop
'td': 't', # was 'noise', preserve first stop
'tb': 't', # was 'noise', preserve first stop
'tn': 't', # was 'noise', preserve stop
# Long vowel sequences
'eːs': 'e:', # was 'noise', preserve long vowel
'eːt': 'e:', # was 'noise', preserve long vowel
'eːp': 'e:', # was 'noise', preserve long vowel
'eːf': 'e:', # current mapping is good
'eːz': 'e:', # current mapping is good
'eːj': 'e:', # current mapping is good
'eːx': 'e:', # current mapping is good
'eːʃ': 'e:', # current mapping is good
'oːs': 'o:', # current mapping is good
'oːb': 'o:', # current mapping is good
# Vowel sequences
'ɑj': 'aɪ', # was 'noise', map to diphthong
'ɑh': 'a', # was 'noise', preserve vowel
'ɑm': 'a', # was 'noise', preserve vowel
'ɑk': 'a', # was 'noise', preserve vowel
'ɑn': 'a', # was 'noise', preserve vowel
'ɑq': 'a', # was 'noise', preserve vowel
'ɑt': 'a', # was 'noise', preserve vowel
'ɑo': 'a', # was 'noise', preserve first vowel
'ɑa': 'a', # was 'noise', preserve first vowel
'ɑaː': 'a:', # was 'noise', map to long vowel
'ɑuː': 'aʊ', # was 'noise', map to diphthong
# Other sequences
'dʒv': 'dʒ', # current mapping is good
'bv': 'b', # was 'noise', preserve stop
'bh': 'b', # was 'noise', preserve stop
'ɡh': 'g', # was 'noise', preserve stop
'ɡz': 'g', # was 'noise', preserve stop
'hx': 'x', # current mapping is good
'ʃj': 'ʃ', # was 'noise', preserve fricative
# Special cases
'(fa)': 'SIL', # current mapping is good
'bb': 'b', # current mapping is good
'uːb': 'u:',
'uːk': 'u:',
'laɪ': 'noise',
# --------------------------------------- End of rare sequences
# Vowels and length variants
'əː': 'ə', # Long schwa maps to schwa (index 18)
'æː': 'æ', # Long ash maps to ash (index 32)
'æi': 'eɪ', # Map to similar diphthong (index 23)
'æiː': 'eɪ', # Map to similar diphthong (index 23)
'ɵː': 'ʊ', # Long rounded vowel maps to nearest equivalent (index 22)
#'ɯ': 'ʊ', # Unrounded high back vowel maps to nearest equivalent (index 22)
'ɯ': 'ɯ',
# Alternative transcription formats
'e:': 'e:', # Long e
'eː': 'e:', # Normalize colon to IPA length mark (index 43)
#'e:': 'e', # NOT merged due to high confusion
'o:': 'o:',
'y:': 'y', # Normalize colon to IPA length mark (index 39)
'u:': 'u:', # Normalize colon to IPA length mark (index 5)
'i:': 'i:', # Normalize colon to IPA length mark (index 12)
'ɑ:': 'a', # Normalize colon to IPA length mark (index 13)
'oe:': 'ø', # Normalize colon to IPA length mark (index 40)
'oe': 'ø', # Map to equivalent (index 40)
# ASCII-based transcription variants
'S': 's', # ASCII variant of 's' (index 21)
'N': 'n', # ASCII variant of 'n' (index 11)
'X': 'k', # ASCII variant, typically representing 'k' (index 27)
'tS': 'tʃ', # ASCII variant of 'tʃ' (index 1)
'dZ': 'dʒ', # ASCII variant of 'dʒ' (index 2)
# Special characters and diacritics
't^': 't', # Remove diacritic (index 4)
's^': 's', # Remove diacritic (index 21)
'd^': 'd', # Remove diacritic (index 9)
't^ː': 't', # Remove diacritic and length (index 4)
't[': 't', # Remove bracket notation (index 4)
'd[': 'd', # Remove bracket notation (index 9)
# Arabic phonemes
'ʕ': 'h', # Voiced pharyngeal fricative maps to nearest fricative (index 37)
'ħ': 'h', # Voiceless pharyngeal fricative maps to 'h' (index 37)
'dˤ': 'd', # Pharyngealized 'd' maps to plain 'd' (index 9)
's̪': 's', # Dental 's' maps to plain 's' (index 21)
'χ': 'x', # Voiceless uvular fricative maps to 'h' (index 37)
'dˤdˤ': 'd', # Doubled pharyngealized 'd' maps to 'd' (index 9)
'dd': 'd', # ASCII variant of doubled/pharyngealized 'd' (index 9)
# Dot notation variants
'i.ː': 'i:', # Normalize dot notation (index 12)
'a.ː': 'a:', # Normalize dot notation (index 13)
'u.ː': 'u:', # Normalize dot notation (index 5)
# Lateral approximant variant
'ɫ': 'l', # Velarized lateral maps to plain 'l' (index 16)
# Consonant sequences (map to noise)
'kt': 'noise', # Consonant sequence (index 50)
'd̪w': 'noise', # Consonant sequence (index 50)
'wb': 'noise', # Consonant sequence (index 50)
'fm': 'noise', # Consonant sequence (index 50)
# Vowel-consonant and vowel-vowel sequences (mapped to noise, or reduced to a single long vowel)
'ʊːt': 'noise', # Vowel-consonant sequence (index 50)
'aɪp': 'noise', # Vowel-consonant sequence (index 50)
'əm': 'noise', # Vowel-consonant sequence (index 50)
'aːn': 'a:', # Vowel-consonant sequence (index 50)
'iːe': 'i:', # Vowel-vowel sequence (index 50)
'yi': 'i:', # Vowel-vowel sequence (index 50)
# Language markers (map to SIL)
'(tt)': 'SIL', # Language marker (index 0)
# Double long vowel - map to standard long vowel
'iːː': 'i:', # Excessive length mark, normalize to standard long i (index 12)
# Doubled diphthong - map to single diphthong
'aɪaɪ': 'aɪ', # Repeated diphthong, map to single instance (index 7)
# Remaining clusters - 'ndʒ' keeps its affricate, the others map to noise like other sequences
'ndʒ': 'dʒ', # Consonant cluster (index 50)
'tr': 'noise', # Consonant cluster (index 50)
'eβ': 'noise', # Vowel-consonant sequence (index 50)
# Double palatalization - map to single palatalized form then apply existing mappings
'ʂʲ': 'ʃ', # Map palatalized retroflex to palato-alveolar (index 1)
'nʲʲ': 'nʲ', # Double palatalized nasal to plain nasal (index 11)
'tsʲ': 'ts', # Palatalized affricate follows affricate mapping (index 4)
'xʲ': 'h', # Palatalized velar fricative to h (index 37)
'dʑʲ': 'dʒ', # Palatalized voiced affricate to voiced palato-alveolar (index 2)
'ɕʲ': 'ɕ', # Palatalized alveolo-palatal to palato-alveolar (index 1)
'tɕʲ': 'ʃ', # Palatalized affricate to palato-alveolar (index 1)
'tʲʲ': 'tʲ', # Double palatalized stop to plain stop (index 4)
'ʒʲ': 'ʒ', # Palatalized palato-alveolar remains (index 2)
'ʃʲʲ': 'ʃ', # Double palatalized palato-alveolar remains (index 1)
'tsʲʲ': 'ts', # Double palatalized affricate to stop (index 4)
'ɾʲʲ': 'ɾ', # Double palatalized tap remains (index 48)
'zʲʲ': 'z', # Double palatalized fricative remains (index 36)
'ɾʲ': 'rʲ', # Palatalized tap remains (index 48)
'ʃʲ': 'ʃ', # Palatalized palato-alveolar remains (index 1)
'mʲʲ': 'm', # Double palatalized nasal to plain (index 28)
'ʲ': 'noise', # Isolated palatalization mark to noise (index 50)
# Vowel sequences - map to nearest phoneme or diphthong
'uo': 'oʊ', # Map to nearest diphthong (index 24)
'ee': 'i:', # Map to long vowel (index 12)
'ie': 'i:', # Map to long vowel (index 12)
'ai': 'aɪ', # Map to standard diphthong (index 7)
'ui': 'u:', # Map to long vowel (index 5)
'au': 'aʊ', # Map to standard diphthong (index 8)
'eɑ': 'ɛ', # Map to nearest monophthong (index 6)
'iu': 'u:', # Map to long vowel (index 5)
'auː': 'aʊ', # Map to standard diphthong (index 8)
'ei': 'eɪ', # Map to standard diphthong (index 23)
'eu': 'oʊ', # Map to nearest diphthong (index 24)
'aiː': 'aɪ', # Map to standard diphthong (index 7)
'iuː': 'u:', # Map to long vowel (index 5)
'eiː': 'eɪ', # Map to standard diphthong (index 23)
'euː': 'oʊ', # Map to nearest diphthong (index 24)
'ɔa': 'ɔ', # Map to long vowel (index 3)
'yɪ': 'y', # Map to long vowel (index 39)
'iɪ': 'i:', # Map to long vowel (index 12)
'eo': 'oʊ', # Map to nearest diphthong (index 24)
# Special notations
'cː': 'k', # Long palatal stop to velar (index 27)
# All Chinese tonal patterns (with numbers) and complex sequences map to 'noise'
# Examples:
'iɜk': 'noise', 'onɡ5': 'noise', 'ts.': 'ts', 'ə5': 'noise',
'ŋf': 'noise', 'u2': 'noise', 'oɜɕ': 'noise', 'iɜ': 'noise',
# MLS-fr
# Consonant sequences to noise
'ls': 'noise', # Lateral + fricative sequence maps to noise (50)
'll': 'noise', # Double lateral sequence maps to noise (50)
# Vowel-consonant sequences to noise
'øːl': 'noise', # Long oe + lateral sequence maps to noise (50)
'øːs': 'noise', # Long oe + fricative sequence maps to noise (50)
# from UCLA Phonetics Dataset
# Syllabic consonants - map to their non-syllabic counterparts
'h̩': 'h', # Syllabic h to h (37)
'ɹ̩': 'ɹ', # Syllabic r to r (17)
'ŋ̩': 'ŋ', # Syllabic ng to ng (34)
'ɫ̩': 'l', # Syllabic dark l to l (16)
'v̩': 'v', # Syllabic v to v (15)
'm̩': 'm', # Syllabic m to m (28)
# Aspirated consonants - map to unaspirated counterparts
'pʰ': 'p', # Aspirated p to p (25)
'tʰ': 't', # Aspirated t to t (4)
'kʰ': 'k', # Aspirated k to k (27)
'sʰ': 's', # Aspirated s to s (21)
'ʃʰ': 'ʃ', # Aspirated sh to sh (1)
'cʰ': 'k', # Aspirated c to k (27)
't͡sʰ': 'ts', # Aspirated ts to t (4)
't͡ʃʰ': 'tʃ', # Aspirated tsh to sh (1)
'ɕʰ': 'ɕ', # Aspirated alveolo-palatal to sh (1)
# Labialized consonants - map to base consonants
'tʷ': 't', # Labialized t to t (4)
'kʷ': 'k', # Labialized k to k (27)
'pʷ': 'p', # Labialized p to p (25)
'ʒʷ': 'ʒ', # Labialized zh to zh (2)
'xʷ': 'h', # Labialized x to h (37)
'dʷ': 'd', # Labialized d to d (9)
'bʷ': 'b', # Labialized b to b (26)
'mʷ': 'm', # Labialized m to m (28)
'ŋʷ': 'ŋ', # Labialized ng to ng (34)
# Retroflexes - map to closest non-retroflex
'ʈ': 't', # Retroflex t to t (4)
'ɖ': 'd', # Retroflex d to d (9)
'ɳ': 'n', # Retroflex n to n (11)
'ɻ': 'ɹ', # Retroflex r to r (17)
'ɽ': 'ɾ', # Retroflex flap to tap (48)
# Breathy voiced - map to regular voiced
'n̤': 'n', # Breathy n to n (11)
'b̤': 'b', # Breathy b to b (26)
'j̤': 'j', # Breathy j to j (29)
'a̤': 'a', # Breathy a to long a (30)
'i̤ː': 'i:', # Breathy long i to long i (12)
'o̤': 'o', # Breathy o to o (44)
'o̤ː': 'o:', # Breathy long o to o (44)
# Nasalized vowels - map to oral counterparts
'ãː': 'a:', # Nasalized long a to long a (30)
'ẽ': 'e', # Nasalized e to e (42)
'ɪ̃': 'ɪ', # Nasalized short i to short i (31)
'ỹ': 'y', # Nasalized y to long y (39)
'õː': 'o:', # Nasalized long o to o (44)
'æ̃': 'æ', # Nasalized ae to ae (32)
'ʌ̃': 'ʌ', # Nasalized wedge to schwa (18)
'ə̃': 'ə', # Nasalized schwa to schwa (18)
'ã': 'a', # Nasalized a to long a (30)
'ĩ': 'i:', # Nasalized i to long i (12)
'ĩː': 'i:', # Nasalized long i to long i (12)
'ũː': 'u:', # Nasalized long u to long u (5)
# Tie-bar affricates - drop the tie bar and keep the affricate (t͡ɬ is reduced to 't')
't͡s': 'ts', # ts to t (4)
't͡ʃ': 'tʃ', # tsh to sh (1)
'd͡ʒ': 'dʒ', # dzh to zh (2)
't͡ɬ': 't', # tl to t (4)
# Ejectives - map to non-ejective counterparts
'tʼ': 't', # Ejective t to t (4)
'kʼ': 'k', # Ejective k to k (27)
'qʼ': 'q', # Ejective q to k (27)
'pʼ': 'p', # Ejective p to p (25)
'sʼ': 's', # Ejective s to s (21)
# Additional vowels
'ʏ': 'ɪ', # Near-close near-front rounded to short i (31)
'ʏː': 'y', # Long near-close near-front rounded to long y (39)
'ʊː': 'ʊ', # Long near-close near-back rounded to short u (22)
'ɤ': 'ə', # Close-mid back unrounded to schwa (18)
'ɤː': 'ə', # Long close-mid back unrounded to schwa (18)
'œː': 'ø', # Long open-mid front rounded to long oe (40)
'ɯː': 'u:', # Long close back unrounded to long u (5)
'ɛ̤': 'ɛ', # Breathy open-mid front unrounded to epsilon (6)
# Short/reduced vowels
'ĕ': 'e', # Short e to e (42)
'ă': 'a', # Short a to long a (30)
'ĭ': 'ɪ', # Short i to short i (31)
'ŏ': 'o', # Short o to o (44)
'ŭ': 'ʊ', # Short u to short u (22)
# Laryngealized/creaky vowels - map to regular vowels
'ḛ': 'e', # Creaky e to e (42)
'ḭ': 'i', # Creaky i to i (41)
'o̰': 'o', # Creaky o to o (44)
'ɛ̰': 'ɛ', # Creaky epsilon to epsilon (6)
'a̰': 'a', # Creaky a to long a (30)
'ʊ̰': 'ʊ', # Creaky upsilon to upsilon (22)
# Additional consonants
'ɦ': 'h', # Voiced h to h (37)
'ʍ': 'w', # Voiceless w to w (47)
'ɢ': 'g', # Uvular g to g (10)
'ɱ': 'm', # Labiodental nasal to m (28)
'ʔ': 'noise', # Glottal stop to noise (50)
'ɮ': 'z', # Voiced lateral fricative to z (36)
'ɸ': 'f', # Bilabial fricative to f (20)
# Co-articulated stops
'k͡p': 'k', # was 'noise', map to velar stop as it's typically more salient
'ɡ͡b': 'g', # was 'noise', map to velar stop (voiced counterpart)
'p͡t': 'p', # was 'noise', map to first stop in sequence
'b͡d': 'b', # was 'noise', map to first stop in sequence
# Lengthened consonants
'ʔː': 'q', # was 'noise', map to closest glottal/uvular stop in inventory
'hː': 'h', # was 'noise', map to plain glottal fricative
'æ̆': 'æ', # Short ae to ae (32)
'ɜ̆': 'ə', # Short epsilon to long epsilon (33)
'ɔ̆': 'ʌ', # Short open-o to long open-o (3)
'ə̠': 'ʌ', # Retracted schwa (when it appears in stressed positions)
'ə̆': 'ə', # Short schwa to schwa (18)
'ɒː': 'a:', # Long open-o to long open-o (3)
# Aspirated and modified affricates
'd͡ʒʰ': 'dʒ', # Aspirated dzh to zh (2)
't͡sʼ': 'ts', # Ejective ts to t (4)
't͡ʃʼ': 'tʃ', # Ejective tsh to sh (1)
't͡ɬʼ': 't', # Ejective tl to t (4)
't͡ʃʲ': 'tʃ', # Palatalized tsh to sh (1)
'd͡ʒʲ': 'dʒ', # Palatalized dzh to zh (2)
# Voiceless sonorants
'e̥': 'e', # Voiceless e to e (42)
'ɲ̥': 'ɲ', # Voiceless ny to ny (38)
'm̥': 'm', # Voiceless m to m (28)
'n̥': 'n', # Voiceless n to n (11)
'l̥': 'l', # Voiceless l to l (16)
'r̥': 'ɹ', # Voiceless r to r (17)
'ŋ̥': 'ŋ', # Voiceless ng to ng (34)
'i̥': 'i', # Voiceless i to i (41)
'u̥': 'u:', # Voiceless u to long u (5)
'ʎ̥': 'l', # Voiceless palatal l to l (16)
# Long consonants
'tʰː': 't', # Long aspirated t to t (4)
'çː': 'ç', # Long palatal fricative to h (37)
'xː': 'h', # Long x to h (37)
'ɟː': 'ʒ', # Long palatal stop to zh (2)
'l̪ː': 'l', # Long dental l to l (16)
'pʰː': 'p', # Long aspirated p to p (25)
'θː': 'θ', # Long th to th (46)
'ɲː': 'ɲ', # Long ny to ny (38)
'wː': 'w', # Long w to w (47)
# Modified velars
'kʰʲ': 'k', # Palatalized aspirated k to k (27)
'kʼʲ': 'k', # Palatalized ejective k to k (27)
'qʰʷ': 'q', # Labialized aspirated q to k (27)
'kʰʷ': 'k', # Labialized aspirated k to k (27)
'kʷʰ': 'k', # Labialized aspirated k to k (27)
'kʷʼ': 'k', # Labialized ejective k to k (27)
'qʷ': 'q', # Labialized q to k (27)
'qʷʼ': 'q', # Labialized ejective q to k (27)
'qʰ': 'q', # Aspirated q to k (27)
'q̠': 'q', # Retracted q to k (27)
'ɢʲ': 'g', # Palatalized uvular g to g (10)
'ɡʷ': 'g', # Labialized g to g (10)
# Rhotic vowels
'e˞': 'ɚ', # Rhotacized e to schwar (14)
'a˞': 'ɚ', # Rhotacized a to schwar (14)
'o˞': 'ɚ', # Rhotacized o to schwar (14)
'u˞': 'ɚ', # Rhotacized u to schwar (14)
'i˞': 'ɚ', # Rhotacized i to schwar (14)
# Nasalized variants
'ɛ̃ː': 'ɛ', # Long nasalized epsilon to epsilon (6)
'ʊ̃': 'ʊ', # Nasalized upsilon to upsilon (22)
'z̃': 'z', # Nasalized z to z (36)
'j̃': 'j', # Nasalized j to j (29)
'w̃': 'w', # Nasalized w to w (47)
'ʊ̰̃': 'ʊ', # Creaky nasalized upsilon to upsilon (22)
'æ̃ː': 'æ', # Long nasalized ae to ae (32)
'ɔ̃ː': 'ɔ', # Long nasalized open-o to long open-o (3)
'ɛ̰̃': 'ɛ', # Creaky nasalized epsilon to epsilon (6)
# Modified dentals/alveolars
'd̪ʰ': 'd', # Aspirated dental d to d (9)
't̪ʰ': 't', # Aspirated dental t to t (4)
't̪ʲ': 'tʲ', # Palatalized dental t to t (4)
'tʲʰ': 'tʲ', # Palatalized aspirated t to t (4)
'dʰ': 'd', # Aspirated d to d (9)
'ðʲ': 'ð', # Palatalized eth to eth (35)
'zʲ': 'z', # Palatalized z to z (36)
'zʷ': 'z', # Labialized z to z (36)
# Complex modifications
'ʃʷ': 'ʃ', # Labialized sh to sh (1)
'ɕʷ': 'ɕ', # Labialized alveolo-palatal to sh (1)
'ʑʷ': 'ʒ', # Labialized voiced alveolo-palatal to zh (2)
'ʕʷ': 'h', # Labialized pharyngeal to h (37)
'ħʷ': 'h', # Labialized voiceless pharyngeal to h (37)
'ʁʷ': 'ɹ', # Labialized uvular to r (17)
'χʲ': 'h', # Palatalized x to h (37)
'hʲ': 'h', # Palatalized h to h (37)
# Retracted/advanced variants
'ɨ̠': 'ɨ', # Retracted barred-i to barred-i (45)
'ʊ̠': 'ʊ', # Retracted upsilon to upsilon (22)
'ʊ̟': 'ʊ', # Advanced upsilon to upsilon (22)
'æ̟': 'æ', # Advanced ae to ae (32)
'ə̟': 'ə', # Advanced schwa to schwa (18)
# Dental variants
'n̪': 'n', # Dental n to n (11)
'l̪': 'l', # Dental l to l (16)
# Special vowels
'ö': 'ø', # O-umlaut to long oe (40)
'ü': 'y', # U-umlaut to long y (39)
'ʉ': 'ɨ', # Central u to long u (5)
'ɞ': 'ə', # Open-mid central rounded to schwa (18)
'ɤ̈': 'ə', # Advanced close-mid back unrounded to schwa (18)
'ɯ̈': 'ɨ', # Advanced high back unrounded
# Implosives/ejectives/glottalized
'ɗ': 'd', # Implosive d to d (9)
'ɓ': 'b', # Implosive b to b (26)
'ʄ': 'ʒ', # Implosive palatal to zh (2)
'dˀ': 'd', # Glottalized d to d (9)
'bˀ': 'b', # Glottalized b to b (26)
'ˀa': 'a', # Preglottalized a to long a (30)
# Modified retroflexes
'ʈʰ': 't', # Aspirated retroflex t to t (4)
'ɖʰ': 'd', # Aspirated retroflex d to d (9)
# Remaining special cases
'ɥ': 'j', # Labial-palatal approximant to j (29)
'ʀ': 'ɹ', # Uvular trill to r (17)
'ɹ̝': 'ɹ', # Raised r to r (17)
'ṽ': 'v', # Nasalized v to v (15)
'ə̥': 'ə', # Voiceless schwa to schwa (18)
'ə̯': 'ə', # Non-syllabic schwa to schwa (18)
'i̯': 'i', # Non-syllabic i to i (41)
'l̴': 'l', # Velarized l to l (16)
'dⁿ': 'd', # Prenasalized d to d (9)
'tⁿ': 't', # Prenasalized t to t (4)
# Breathy/creaky variants
'd̪̤': 'd', # Breathy dental d to d (9)
'ɑ̤': 'a', # Breathy long a to long a (13)
'ṳː': 'u:', # Breathy long u to long u (5)
'ṳ': 'u:', # Breathy u to long u (5)
'ɯ̤': 'u:', # Breathy unrounded u to long u (5)
'ɪ̰': 'ɪ', # Creaky short i to short i (31)
'ɔ̰': 'ɔ', # Creaky open-o to long open-o (3)
'ɔ̤': 'ɔ', # Breathy open-o to long open-o (3)
# Height/backness variants
'ɑ̝': 'a', # Raised long a to long a (13)
'ɛ̞': 'ɛ', # Lowered epsilon to epsilon (6)
'ɛ̝': 'ɛ', # Raised epsilon to epsilon (6)
'e̝': 'e', # Raised e to e (42)
'o̝': 'o', # Raised o to o (44)
'u̝': 'u:', # Raised u to long u (5)
'ɑ̞': 'a', # Lowered long a to long a (13)
'a̘': 'a', # Advanced tongue root a to long a (30)
'ä': 'a', # Centralized a to long a (30)
# Modified vowel quality
'ɛ̈': 'ɛ', # Centralized epsilon to epsilon (6)
'œ̈': 'ø', # Centralized oe to long oe (40)
'ʌ̈': 'ʌ', # Centralized wedge to schwa (18)
'ɛ̠': 'ɛ', # Retracted epsilon to epsilon (6)
'a̠': 'a', # Retracted a to long a (30)
'o̠': 'o', # Retracted o to o (44)
'i̠': 'i', # Retracted i to i (41)
# Remaining consonant variants
't̠': 't', # Retracted t to t (4)
'd̠': 'd', # Retracted d to d (9)
'n̠': 'n', # Retracted n to n (11)
't̟': 't', # Advanced t to t (4)
'r̟': 'ɹ', # Advanced r to r (17)
'r̠': 'ɹ', # Retracted r to r (17)
'rˠ': 'ɹ', # Velarized r to r (17)
'ɪ̥': 'ɪ', # Voiceless short i to short i (31)
'ʔʷ': 'noise', # Labialized glottal stop to noise (50)
'ɕʼ': 'ɕ', # Ejective alveolo-palatal to sh (1)
'cʼ': 'k', # Ejective c to k (27)
'cʷʰ': 'k', # Labialized aspirated c to k (27)
'w̝': 'w', # Raised w to w (47)
'ʃ̠': 'ʃ', # Retracted sh to sh (1)
'ɪ̰̃': 'ɪ', # Creaky nasalized short i to short i (31)
'tʷʼ': 't', # Labialized ejective t to t (4)
'ŋʲ': 'ŋ', # Palatalized ng to ng (34)
'bʰ': 'b', # Aspirated b to b (26)
'æ̈': 'æ', # Centralized ae to ae (32)
'ɘ': 'ə', # Close-mid central unrounded vowel to schwa (18)
'tsʰ': 'ts', # Aspirated ts to ts (4)
'r̩ː': 'ɚ', # Long rhotic schwa to schwar (14)
}
def get_compound_phoneme_mapping(phoneme, mapping=None):
    """Map a (possibly compound) phoneme string onto the reduced phoneme set.

    The whole string is looked up first. If that fails, the string is
    consumed left to right, always taking the longest prefix that has an
    entry in the mapping, and the mapped pieces are concatenated.
    Characters that do not start any known prefix are dropped.

    Args:
        phoneme: Phoneme string to map.
        mapping: Optional dict of source phoneme -> target phoneme. When
            omitted, the module-level ``phoneme_mapping`` is used.

    Returns:
        The mapped string, or ``"noise"`` when nothing could be mapped
        (this includes the empty string).
    """
    if mapping is None:
        mapping = phoneme_mapping

    # First try direct mapping
    if phoneme in mapping:
        return mapping[phoneme]

    # For compound phonemes, map components and combine
    pieces = []
    remaining = phoneme
    while remaining:
        # Try to match the longest possible prefix first
        for end in range(len(remaining), 0, -1):
            prefix = remaining[:end]
            if prefix in mapping:
                pieces.append(mapping[prefix])
                remaining = remaining[end:]
                break
        else:
            # No known prefix starts here: skip one character and keep
            # scanning so the rest of the string can still be mapped.
            remaining = remaining[1:]

    mapped = "".join(pieces)
    return mapped if mapped else "noise"
def create_normalized_mapping(mapping_dict):
    """Create a mapping dictionary with normalized Unicode characters.

    Both keys and values are converted to Unicode NFC form, so that strings
    which render identically but use different code-point sequences
    (precomposed vs. base + combining mark) compare equal.

    Args:
        mapping_dict: Dict of str -> str to normalize.

    Returns:
        A new dict; ``mapping_dict`` is not modified. If two keys normalize
        to the same string, the entry that comes later in ``mapping_dict``
        wins.
    """
    # Local import keeps this block self-contained.
    from unicodedata import normalize

    return {
        normalize('NFC', key): normalize('NFC', value)
        for key, value in mapping_dict.items()
    }
# Normalized copy of phoneme_mapping, built once at import time.
phoneme_mapper = create_normalized_mapping(phoneme_mapping) # Both the keys and the values are normalized
#print(phoneme_mapper)
def analyze_phoneme_merger(phoneme_mapper):
    """Run consistency checks on a phoneme mapping and print a coverage report.

    Args:
        phoneme_mapper: Dict of source phoneme -> target phoneme to check.

    Returns:
        List of human-readable issue strings from the enabled checks
        (chained mappings, vowel-pair consistency, r-colored vowels).

    Besides the returned issues, this prints every ``complete_vocab`` entry
    whose mapping is 'noise' or whose count exceeds 5000 (when the mapped
    form differs from the entry itself), and the ``complete_vocab`` keys that
    have no entry in the module-level ``phoneme_mapping``.
    """

    def check_circular_refs(mapper):
        # Flags targets that are themselves remapped to something else.
        return [
            f"Potential circular reference: {phoneme} -> {target} -> {mapper[target]}"
            for phoneme, target in mapper.items()
            if target in mapper and mapper[target] != target
        ]

    def check_vowel_consistency(mapper):
        # Common vowel pairs that should merge consistently
        vowel_pairs = (
            ('ɑː', 'ɑːɹ'),  # Long a with/without r
            ('ɔː', 'ɔːɹ'),  # Long o with/without r
            ('iː', 'iə'),  # Long i and i-schwa
            ('ʊ', 'ʊɹ'),  # Short u with/without r
        )
        return [
            f"Inconsistent vowel mapping: {v1} -> {mapper[v1]} but {v2} -> {mapper[v2]}"
            for v1, v2 in vowel_pairs
            if v1 in mapper and v2 in mapper and mapper[v1] != mapper[v2]
        ]

    def check_r_colored_consistency(mapper):
        target = 'ɚ'  # All should map to schwa-r
        return [
            f"Inconsistent r-colored vowel: {phoneme} -> {mapper[phoneme]}, expected -> {target}"
            for phoneme in ('ɪɹ', 'ɛɹ', 'ʊɹ')
            if phoneme in mapper and mapper[phoneme] != target
        ]

    def check_compound_handling(mapper):
        # Multi-character phonemes exempt from the "same first character" rule.
        exempt = ['tʃ', 'dʒ', 'aɪ', 'eɪ', 'oʊ', 'aʊ', 'ɔɪ', 'iə', 'uː', 'iː', 'ɑː', 'ɔː', 'ɜː', 'əl']
        issues = []
        for phoneme in mapper:
            if len(phoneme) <= 1 or phoneme in exempt:
                continue
            if not phoneme.startswith(mapper[phoneme][0]):
                issues.append(f"Potentially incorrect compound mapping: {phoneme} -> {mapper[phoneme]}")
        return issues

    # Collect all issues (check_compound_handling is intentionally left disabled)
    all_issues = [
        *check_circular_refs(phoneme_mapper),
        *check_vowel_consistency(phoneme_mapper),
        *check_r_colored_consistency(phoneme_mapper),
    ]
    #all_issues.extend(check_compound_handling(phoneme_mapper))

    print("Testing complete vocab:")
    for kv in list(complete_vocab.keys()):
        mapped = get_compound_phoneme_mapping(kv)
        if mapped == kv:
            continue
        count = complete_vocab[kv]
        # Only report entries that end up as noise or are frequent.
        if mapped == 'noise' or count > 5000:
            print(f"{kv} -> {mapped} \tcount: {count}")

    # Verify coverage
    missing_phonemes = set(complete_vocab.keys()) - set(phoneme_mapping.keys())
    print(f"Missing phonemes: {missing_phonemes}")
    for phoneme in missing_phonemes:
        print(f"{phoneme} -> {complete_vocab[phoneme]}")

    return all_issues
def create_new_index():
    """Build and print a phoneme -> integer index from ``phoneme_mapper``.

    Target phonemes are ordered by how many source phonemes are merged onto
    them (most first). 'SIL' is fixed at index 0 and 'noise' takes the last
    index. The function then runs ``analyze_phoneme_merger`` and prints its
    findings together with a comparison between the index keys and the
    mapping's targets. Nothing is returned; the index built here is a local
    variable and is only printed.
    """
    # First, count how many source phonemes map onto each target (merged branches)
    merged_counts = {}
    for target in phoneme_mapper.values():
        merged_counts[target] = merged_counts.get(target, 0) + 1
    print(f"Mapped: {len(phoneme_mapper)} phonemes onto {len(merged_counts)} phonemes")
    #print(phoneme_mapper)

    # Order targets by merged count, descending, leaving out the special tokens
    regular_targets = [p for p in merged_counts if p not in ('SIL', 'noise')]
    sorted_phonemes = sorted(regular_targets, key=merged_counts.__getitem__, reverse=True)

    # SIL first, regular phonemes from 1, noise last
    phoneme_mapped_index = {'SIL': 0}
    for position, phoneme in enumerate(sorted_phonemes, start=1):
        phoneme_mapped_index[phoneme] = position
    phoneme_mapped_index['noise'] = len(sorted_phonemes) + 1

    print("New index created:")
    print(phoneme_mapped_index)
    print("Unique phonemes in the new index:")
    print(list(phoneme_mapped_index.keys()))

    # Run the analysis and print findings
    issues = analyze_phoneme_merger(phoneme_mapper)
    print("Found the following potential issues:")
    for number, issue in enumerate(issues, 1):
        print(f"{number}. {issue}")

    # Additional validation of the phoneme_mapped_index
    mapped_phonemes = set(phoneme_mapped_index.keys())
    merger_outputs = {p for p in phoneme_mapper.values() if not p.endswith('*')}
    missing_indices = merger_outputs - mapped_phonemes
    extra_indices = mapped_phonemes - merger_outputs

    print("\nIndex validation:")
    if missing_indices:
        print(f"Merged phonemes missing from index: {missing_indices}")
    if extra_indices:
        print(f"Extra phonemes in index: {extra_indices}")
    print("Done")
def check_missing_phonemes():
    """Print which phonemes of a fixed test list are absent from ``phoneme_mapper``.

    Also prints the test phonemes whose mapping target is 'noise'.
    Nothing is returned.
    """
    # from UCLA phonetics, some repeated
    test_phonemes = ['a', 'd͡ʒ', 'ʃʲ', 'm', 'ɜ', 'ɘ', 'ʃ', 't͡ʃʰ', 'r', 'ä', 't͡ʃ', 'ə̆', 'pʰ', 'ɜ̆', 'ʌ̈', 't', 'ʃʰ', 'kʼ', 'ʒʲ', 'ə', 'ă', 'b', 'ɨ', 'æ̈', 'j', 'ɛ̈', 'p', 'd', 'n', 'ɥ', 'ɡ', 't͡ʃʼ', 'χ', 'ˀa', 'ʒ', 'ħʷ', 'ɹ', 'ħ', 'œ̈', 'ɾ', 'ʁ', 'ɤ̈', 'z', 'i', 'χʲ', 'tʰ', 's', 'ʁʷ', 'h', 'ɛ', 'k', 'ɑ', 'x', 'ɔ', 'o', 'u', 'e', 'ɑ̃', 'ŋ', 'l', 'ʊ', 'ã', 'q̠', 'õ', 'w', 'β', 'f', 'v', 'ʎ', 'oː', 'eː', 'kʰ', 'ð', 'œ', 'ɹ̩', 'ɛ̝', 'ʔ', 'l̥', 'e̝', 'aː', 'uː', 'iː', 'ʌ̃', 'æ', 'ẽ', 'y', 'yː', 'ɪː', 'ɛː', 'øː', 'œː', 'ɑː', 'o̝', 'ʌ', 'ø', 'ɯ', 'sː', 'ɛ̃', 'c', 'ɪ', 'ɟ', 'ɲ', 'æː', 'æ̃ː', 'ʉ', 'ɫ̩', 'ʋ', 'ɫ', 'kʲ', 'ɣ', 'ɦ', 'n̩', 'ɸ', 'dʰ', 'm̩', 'h̩', 'ç', 'bʰ', 't̪', 'd̪', 'd̪̤', 'b̤', 'n̪', 'ĩ', 'ũː', 'ũ', 'j̤', 'l̪', 'pː', 'kː', 'rː', 'nː', 'l̪ː', 'bː', 'mː', 'ɞ', 't̪ʲ', 'hː', 'ʔː', 'tː', 'dː', 'ʈ', 'ɖ', 'ʂ', 'ʐ', 'r̥', 'ɔː', 'ʏː', 'ʏ', 'θ', 'n̥', 'cː', 'ɟː', 'fː', 'lː', 'ŋ̥', 'ə̯', 'ə̟', 'i̯', 'ʊ̟', 'ɛ̞', 'ʊ̠', 'r̟', 'r̠', 'ɕ', 'pʲ', 'bʲ', 'ŭ', 'tʲ', 'ĕ', 'dʲ', 'ɡʲ', 'nʲ', 'fʲ', 'zʲ', 'vʲ', 'lʲ', 'sʲ', 'xʲ', 'hʲ', 'ŏ', 'mʲ', 't͡ʃʲ', 'd͡ʒʲ', 'æ̆', 'ŋʲ', 'rʲ', 'ɾʲ', 'ĭ', 'ɔ̆', 's̪', 'ɱ', 'ɽ', 'ɳ', 'ʈʰ', 'ɖʰ', 'ɵ', 't̪ʰ', 'd͡ʒʰ', 'ɭ', 'ʊ̃', 'sʰ', 'ḭ', 'cʰ', 'ʊ̰', 'ɛ̰', 'ɪ̰', 'a̰', 'ḛ', 'o̰', 'ɛ̰̃', 'ɪ̃', 'ʊ̰̃', 'ɲ̥', 'æ̃', 'm̥', 'ɪ̰̃', 'ɔ̰', 'wː', 'ɔ̃ː', 'ɗ', 'ɔ̃', 'õː', 'ɯː', 'ə̃', 'tʰː', 'pʰː', 'vː', 'zː', 'ʃː', 'jː', 'ɲː', 'xː', 'çː', 'ɓ', 'ãː', 't͡sʼ', 'ɻ', 'ʀ', 't͡s', 'a', 'b', 'w', 'e', 'ɔ', 'p', 'ɛ', 't', 'o', 't͡ʃ', 'u', 'd', 'k', 'ɔ̃', 'kʷ', 'ɡ', 'k͡p', 'm', 'n', 'n̠', 'j', 'f', 's', 'ç', 'ɹ', 'l', 'i', 'ʍ', 'd̠', 'ʐ', 'ŋ', 'ɥ', 't̠', 'ɕʷ', 'ɕ', 'pʰ', 'tʰ', 'sʰ', 'kʰ', 'z', 'ä', 'h', 'v', 'ʃ', 'ʒ', 'r', 'ü', 'y', 'ʔ', 'ɪ', 'æ', 'ə', 'q̠', 'ɞ', 't͡ʃʰ', 'ĩ', 'ã', 'õ', 'ʋ', 'x', 'ɾ', 'ɓ', 'ɗ', 'c', 'ɟ', 'ʄ', 'aː', 'ɲ', 'ɔː', 'tʲ', 'oː', 'ɤː', 'uː', 'ʊː', 'ɳ', 'ɯː', 'ðʲ', 'tʲʰ', 'ɛ̃', 'ɣ', 'kʲ', 'ũ', 'ĩː', 'rˠ', 'ɛ̃ː', 'ãː', 'ɔ̃ː', 'ũː', 't̪', 'ʑʷ', 'ʑ', 'ɡʷ', 'ŋʷ', 'ɽ', 'o̠', 'w̃', 'ɯ', 'ö', 'ɡ͡b', 'd͡ʒ', 'ʁ', 'q', 'i̠', 'ɛ̠', 'v̩', 'l̥', 'ɤ', 'r̥', 'ɢ', 'ɢʲ', 
    'χ', 'kʰʲ', 'm̥', 'n̥', 'nː', 'pː', 'lː', 'rː', 'æː', 'eː', 'o˞', 'e˞', 'a˞', 'i˞', 'iː', 'u˞', 'ʕʷ', 'ʕ', 'xʷ', 'ɬ', 'qʷ', 'ɑ', 'ɪ̃', 'ẽ', 'ʊ', 'd̪', 'd͡ʒʰ', 'ɦ', 't̪ʰ', 'd̪ʰ', 'dʰ', 'bʰ', 'ʌ', 'pʼ', 'ʊ̃', 'kʼ', 'β', 'kʼʲ', 'ħ', 'qʼ', 'cʼ', 'kʰʷ', 'qʰʷ', 'ɨ', 'ð', 'ɖ', 'ɸ', 'ʏ', 'ø', 'l̩', 'dʷ', 'pʷ', 'bʷ', 'tʷ', 'ṽ', 'z̃', 'ʃʷ', 'ʒʷ', 'a̘', 't͡s', 'n̤', 'ŋ̩', 'h̩', 'ɹ̝', 'ɑː', 'ɑ̞', 'ɑ̝', 'ɛː', 'ɪː', 'u̝', 'sʲ', 'ɜ', 'ɨː', 'θ', 'l̴', 'n̩', 'j̃', 't͡ɬ', 'sʼ', 'kʷʼ', 'cʰ', 'qʷʼ', 'zʷ', 'qʰ', 'kʷʰ', 't͡ɬʼ', 'cʷʰ', 'ʁʷ', 'tʷʼ', 'a̤', 'ɔ̤', 'o̤ː', 'i̤ː', 'ṳ', 'o̤', 'ṳː', 'ɯ̤', 'tʼ', 'ɑ̃', 'ɫ', 'ɑ̤', 'ʌ̃', 'ɛ̤', 'p͡t', 'b͡d', 'mʷ', 'w̝', 'ʎ̥', 'ɮ', 'ʃ̠', 'fː', 'i̥', 'u̥', 'ɪ̥', 'zː', 'sː', 'ʎ', 'ə̥', 'ʃː', 'e̥', 'ỹ', 'ɯ̈', 'ʉ', 'ɒ', 'xː', 'l̪', 'n̪', 'θː', 'ɒː', 'dˀ', 'bˀ', 't̟', 'æ̟', 'dⁿ', 'ɨ̠', 'tⁿ', 'a̠', 't͡sʰ', 'ɕʰ', 'm̩', 'ɭ', 'ə̃', 'ɕʼ', 't͡ʃʼ', 'ʔʷ', 'tsʰ']

    # Test phonemes that have no key in the mapper
    unmapped = set(test_phonemes) - set(phoneme_mapper.keys())
    print(f"Missing phonemes: {unmapped}")
    print(len(unmapped))

    # Test phonemes whose mapping target is 'noise'
    noise_sources = [source for source, target in phoneme_mapper.items() if target == 'noise']
    noise_in_test = set(noise_sources) & set(test_phonemes)
    print(f"Noise phonemes in test set: {noise_in_test}")
    # only {'ʔ', 'ʔʷ'} are mapped to noise from ucla dataset
def check_duplicates(mapping=None):
    """Report keys that collide after Unicode normalization with different targets.

    A dict can hold each key only once, so scanning an already-built,
    already-normalized dict can never find a key with two values. This
    check therefore groups the un-normalized mapping by the NFC form of
    each key: two source keys that differ only in code-point composition
    end up in the same group, and a group with more than one distinct
    target is a conflict that normalization would silently resolve by
    keeping the last entry.

    Args:
        mapping: Optional dict of source phoneme -> target phoneme to
            inspect. When omitted, the module-level ``phoneme_mapping``
            is used.

    Returns:
        Dict of normalized key -> set of the distinct normalized targets,
        containing only the conflicting keys (empty when there are none).
        The same information is printed.
    """
    from collections import defaultdict
    from unicodedata import normalize

    if mapping is None:
        mapping = phoneme_mapping

    # Group targets by the normalized form of their source key
    targets_by_key = defaultdict(set)
    for key, value in mapping.items():
        targets_by_key[normalize('NFC', key)].add(normalize('NFC', value))

    # Keep only keys with multiple different values
    duplicates = {
        key: targets
        for key, targets in targets_by_key.items()
        if len(targets) > 1
    }
    print("Duplicate keys with different values:", len(duplicates))
    for key, targets in duplicates.items():
        print(f"Key '{key}' has different values: {targets}")
    return duplicates
def make_phoneme_groups():
    """Build the phoneme-group index and the phoneme-index -> group-index table.

    Phonemes listed in a group but absent from ``phoneme_mapped_index`` are
    dropped from the groups; a phoneme of ``phoneme_mapped_index`` that is in
    no group triggers an assertion error. The resulting group name -> group
    index dict is stored in the module-level ``phoneme_groups_index``
    ('SIL' is 0, 'noise' is last). The phoneme-index -> group-index table is
    only printed. Nothing is returned.
    """
    global phoneme_groups_index

    # Finer-grained grouping; defined but not used below.
    phoneme_groups_19 = {
        # Vowels - Separated by height and frontness
        "high_front_vowels": ["i", "i:", "ɪ", "y", "ʏ", "iː"],
        "high_back_vowels": ["u", "u:", "ʊ", "ɯ", "ʉ", "ɨ", "uː"],
        "mid_front_vowels": ["e", "e:", "ɛ", "ø", "œ", "eː"],
        "mid_central_vowels": ["ə", "ɜ", "ɜ:", "ɚ", "ʌ", "ɘ", "ɵ"],
        "mid_back_vowels": ["o", "o:", "ɔ", "ɔ:", "ɤ", "oː"],
        "low_vowels": ["a", "a:", "æ", "ɐ", "ɑ", "ɑ:", "ɒ", "aː"],
        "diphthongs": ["aɪ", "eɪ", "ɔɪ", "aʊ", "oʊ", "ɛə", "ɪə", "ʊə"],
        # Consonants - Organized by manner and voicing
        "voiceless_stops": ["p", "t", "k", "q", "ʔ", "ʈ", "c"],
        "voiced_stops": ["b", "d", "g", "ɢ", "ɖ", "ɟ"],
        "voiceless_fricatives": ["f", "θ", "s", "ʃ", "ç", "x", "h", "ħ", "ʂ", "ɕ", "χ"],
        "voiced_fricatives": ["v", "ð", "z", "ʒ", "ʝ", "ɣ", "ʕ", "ʐ", "ʑ", "ʁ"],
        "voiceless_affricates": ["ts", "tʃ", "tɕ", "ʈʂ"],
        "voiced_affricates": ["dz", "dʒ", "dʑ", "ɖʐ"],
        "nasals": ["m", "n", "ɲ", "ŋ", "ɴ", "ɱ", "ɳ"],
        # Liquids, glides, and palatalized sounds
        "laterals": ["l", "ɭ", "ʎ", "ʟ"],
        "rhotics": ["r", "ɾ", "ɹ", "ʀ", "ɽ", "ɻ"],
        "glides": ["j", "w", "ɥ", "ɰ"],
        "palatalized": ["ɭʲ", "rʲ", "tʲ", "nʲ"],
        "SIL": ["SIL"],
        "noise": ["noise"],
    }

    phoneme_groups = {
        # Vowels - Adjusted based on confusion patterns
        "front_vowels": ["i", "i:", "ɪ", "y", "ʏ", "iː", "e", "e:", "ɛ", "ø", "œ", "eː"],  # Merged high/mid front
        "central_vowels": ["ə", "ɜ", "ɜ:", "ɚ", "ʌ", "ɘ", "ɵ"],  # Keep central vowels separate
        "back_vowels": ["u", "u:", "ʊ", "ɯ", "ʉ", "ɨ", "uː", "o", "o:", "ɔ", "ɔ:", "ɤ", "oː"],  # Merged high/mid back
        "low_vowels": ["a", "a:", "æ", "ɐ", "ɑ", "ɑ:", "ɒ", "aː"],  # Keep low vowels separate
        "diphthongs": ["aɪ", "eɪ", "ɔɪ", "aʊ", "oʊ", "ɛə", "ɪə", "ʊə"],  # Keep diphthongs separate
        # Consonants - Maintain voicing distinction for stops and fricatives
        "voiceless_stops": ["p", "t", "k", "q", "ʔ", "ʈ", "c", "tʲ"],  # Add palatalized t
        "voiced_stops": ["b", "d", "g", "ɢ", "ɖ", "ɟ"],
        "voiceless_fricatives": ["f", "θ", "s", "ʃ", "ç", "x", "h", "ħ", "ʂ", "ɕ", "χ"],
        "voiced_fricatives": ["v", "ð", "z", "ʒ", "ʝ", "ɣ", "ʕ", "ʐ", "ʑ", "ʁ"],
        # Keep affricates distinction by voicing
        "voiceless_affricates": ["ts", "tʃ", "tɕ", "ʈʂ"],
        "voiced_affricates": ["dz", "dʒ", "dʑ", "ɖʐ"],
        # Merge palatalized nasals with base nasals
        "nasals": ["m", "n", "nʲ", "ɲ", "ŋ", "ɴ", "ɱ", "ɳ"],
        # Merge palatalized laterals with base laterals
        "laterals": ["l", "ɭ", "ɭʲ", "ʎ", "ʟ"],
        # Merge palatalized rhotics with base rhotics
        "rhotics": ["r", "rʲ", "ɾ", "ɹ", "ʀ", "ɽ", "ɻ"],
        # Keep glides separate
        "glides": ["j", "w", "ɥ", "ɰ"],
        # Special tokens
        "SIL": ["SIL"],
        "noise": ["noise"],
    }

    # Verify that the groups cover every indexed phoneme
    phoneme_groups_flat = [p for members in phoneme_groups.values() for p in members]
    extra_phonemes = set(phoneme_groups_flat) - set(phoneme_mapped_index.keys())
    print(f"extra phonemes: {extra_phonemes}")
    missing_phonemes = set(phoneme_mapped_index.keys()) - set(phoneme_groups_flat)
    print(f"missing phonemes: {missing_phonemes}")
    assert not missing_phonemes, "Phoneme groups do not cover all phonemes"

    # Drop group members that are not part of the index
    for members in phoneme_groups.values():
        members[:] = [p for p in members if p not in extra_phonemes]

    # Convert each group's phonemes to their index values
    phoneme_groups_based = {
        name: [phoneme_mapped_index[p] for p in members]
        for name, members in phoneme_groups.items()
    }

    # Verify groups are correctly mapped
    for name, members in phoneme_groups.items():
        for p in members:
            assert phoneme_mapped_index[p] in phoneme_groups_based[name], f"{p} not in {name}"

    # Group name -> group index: SIL first, regular groups numbered by their
    # 1-based position in phoneme_groups, noise last.
    group_ids = {"SIL": 0}
    for position, name in enumerate(phoneme_groups, start=1):
        if name not in ("SIL", "noise"):
            group_ids[name] = position
    group_ids["noise"] = len(group_ids)
    phoneme_groups_index = group_ids

    print("phoneme_groups_index:", phoneme_groups_index)
    print("total groups (excluding noise)", len(phoneme_groups_index)-1)

    # Base phoneme index -> group index. When a phoneme is listed in several
    # groups, the group that comes last in phoneme_groups wins.
    group_of_phoneme = {}
    for name, members in phoneme_groups.items():
        for p in members:
            group_of_phoneme[p] = phoneme_groups_index[name]
    base66_to_groups = {}
    for p in phoneme_mapped_index:
        if p in group_of_phoneme:
            base66_to_groups[phoneme_mapped_index[p]] = group_of_phoneme[p]

    # Verify all phonemes are mapped to a group
    assert len(base66_to_groups) == len(phoneme_mapped_index), "Not all phonemes are mapped to a group"
    print("base66_to_groups:", base66_to_groups)
# Script entry point. Only the group construction is enabled; the other
# utilities below are left commented out.
if __name__ == "__main__":
    # Create the new index
    #create_new_index()
    #check_missing_phonemes()
    #check_duplicates()
    make_phoneme_groups()