# this is a prepared index generated from the create_new_index() function phoneme_mapped_index = { # Special token 'SIL': 0, # High front vowels and commonly confused similar vowels 'i': 1, # High front unrounded 'i:': 2, # Long high front unrounded 'ɨ': 3, # High central (grouped here due to high confusion with 'i') 'ɪ': 4, # Near-high front unrounded # Mid front vowels 'e': 5, # Mid front unrounded 'e:': 6, # Long mid front unrounded 'ɛ': 7, # Open-mid front unrounded # Central vowels 'ə': 8, # Schwa (mid central) 'ɚ': 9, # R-colored schwa 'ʌ': 10, # Open-mid back unrounded # Back vowels 'u': 11, # High back rounded 'u:': 12, # Long high back rounded 'ʊ': 13, # Near-high back rounded 'ɯ': 14, # High back unrounded 'o': 15, # Mid back rounded 'o:': 16, # Long mid back rounded 'ɔ': 17, # Open-mid back rounded # Low vowels 'a': 18, # Open central/front unrounded 'a:': 19, # Long open central/front unrounded 'æ': 20, # Near-open front unrounded # Front rounded vowels 'y': 21, # High front rounded 'ø': 22, # Mid front rounded # Diphthongs 'aɪ': 23, # Open central to high front 'eɪ': 24, # Mid front to high front 'aʊ': 25, # Open central to high back 'oʊ': 26, # Mid back to high back 'ɔɪ': 27, # Open-mid back to high front # Stops (organized by place of articulation) 'p': 28, # Voiceless bilabial 'b': 29, # Voiced bilabial 't': 30, # Voiceless alveolar 'd': 31, # Voiced alveolar 'k': 32, # Voiceless velar 'g': 33, # Voiced velar 'q': 34, # Voiceless uvular # Affricates and related sibilant fricatives (grouped by similarity) 'ts': 35, # Voiceless alveolar affricate 's': 36, # Voiceless alveolar fricative 'z': 37, # Voiced alveolar fricative 'tʃ': 38, # Voiceless postalveolar affricate 'dʒ': 39, # Voiced postalveolar affricate 'ʃ': 40, # Voiceless postalveolar fricative 'ʒ': 41, # Voiced postalveolar fricative 'ɕ': 42, # Voiceless alveolo-palatal fricative # Other fricatives (organized by place) 'f': 43, # Voiceless labiodental 'v': 44, # Voiced labiodental 'θ': 45, # Voiceless dental 'ð': 46, # Voiced dental 'ç': 47, # Voiceless palatal 'x': 48, # Voiceless velar 'ɣ': 49, # Voiced velar 'h': 50, # Voiceless glottal 'ʁ': 51, # Voiced uvular # Nasals (organized by place) 'm': 52, # Bilabial 'n': 53, # Alveolar 'ɲ': 54, # Palatal 'ŋ': 55, # Velar # Liquids and approximants 'l': 56, # Alveolar lateral 'ɭ': 57, # Retroflex lateral 'ɾ': 58, # Alveolar tap 'ɹ': 59, # Alveolar approximant 'j': 60, # Palatal approximant 'w': 61, # Labial-velar approximant # Palatalized consonants 'tʲ': 62, # Palatalized t 'nʲ': 63, # Palatalized n 'rʲ': 64, # Palatalized r 'ɭʲ': 65, # Palatalized retroflex lateral # Special token 'noise': 66 } phoneme_groups_mapper = {0: 0, 1: 1, 2: 1, 3: 3, 4: 1, 5: 1, 6: 1, 7: 1, 8: 2, 9: 2, 10: 2, 11: 3, 12: 3, 13: 3, 14: 3, 15: 3, 16: 3, 17: 3, 18: 4, 19: 4, 20: 4, 21: 1, 22: 1, 23: 5, 24: 5, 25: 5, 26: 5, 27: 5, 28: 6, 29: 7, 30: 6, 31: 7, 32: 6, 33: 7, 34: 6, 35: 10, 36: 8, 37: 9, 38: 10, 39: 11, 40: 8, 41: 9, 42: 8, 43: 8, 44: 9, 45: 8, 46: 9, 47: 8, 48: 8, 49: 9, 50: 8, 51: 9, 52: 12, 53: 12, 54: 12, 55: 12, 56: 13, 57: 13, 58: 14, 59: 14, 60: 15, 61: 15, 62: 6, 63: 12, 64: 14, 65: 13, 66: 16} phoneme_groups_index = {'SIL': 0, 'front_vowels': 1, 'central_vowels': 2, 'back_vowels': 3, 'low_vowels': 4, 'diphthongs': 5, 'voiceless_stops': 6, 'voiced_stops': 7, 'voiceless_fricatives': 8, 'voiced_fricatives': 9, 'voiceless_affricates': 10, 'voiced_affricates': 11, 'nasals': 12, 'laterals': 13, 'rhotics': 14, 'glides': 15, 'noise': 16} # vocab counts from train_100h*8_langs, OpenSLR-MLS data 100hours each complete_vocab = {'SIL': 4914031, 'a': 1604572, 'n': 1501496, 't': 1345451, 's': 1242856, 'i': 1207390, 'e': 985568, 'o': 850470, 'm': 840466, 'l': 840200, 'r': 825931, 'd': 821689, 'k': 814493, 'ɛ': 700232, 'p': 607786, 'ə': 492948, 'v': 432914, 'j': 430514, 'u': 422499, 'ɾ': 419723, 'b': 413539, 'ɑ': 399576, 'ɔ': 344276, 'ʌ': 334025, 'ɪ': 294767, 'f': 292469, 'z': 286215, 'ɡ': 267606, 'ʃ': 249935, 'ɐ': 247989, 'w': 222719, 'ʊ': 200737, 'h': 189391, 'ʁ': 161395, 'ð': 159285, 'ɨ': 157902, 'x': 151422, 'eː': 141335, 'y': 138328, 'iː': 135167, 'ŋ': 118468, 'aɪ': 104170, 'ts': 96727, 'ɹ': 96111, 'æ': 83801, 'tʃ': 83484, 'θ': 81846, 'ʒ': 81251, 'uː': 79788, 'aː': 69487, 'ɕ': 68197, 'β': 67991, 'oː': 67190, 'ɑː': 66515, 'ɣ': 64902, 'eɪ': 63049, 'tʲ': 60946, 'ø': 58520, 'ɭ': 58455, 'nʲ': 56439, 'dʒ': 54175, 'ɑ̃': 53516, 'aʊ': 53297, 'q': 49319, 'ɲ': 48479, 'rʲ': 46853, 'ɭʲ': 45521, 'ɔ̃': 44985, 'ɯ': 43339, 'sʲ': 38535, 'ɲʲ': 38014, 'ɒ': 37682, 'vʲ': 37231, 'ʎ': 37145, 'ç': 35610, 'ʋ': 32705, 'ɚ': 32019, 'tɕ': 32006, 'mʲ': 31189, 'dʲ': 29989, 'ɜ': 27714, 'ja': 27523, 'ʔ': 26931, 'oʊ': 26398, 'ɑɨ': 24604, 'tʃʲ': 24301, '1': 23766, 'dʑ': 22131, 'ɛɪ': 21834, 'tː': 21127, 'ᵻ': 20631, 'ɛ̃': 20508, 'uɨ': 20396, 'ɫ': 19763, 'ɬ': 19662, 'ʑ': 18169, 'œ': 17838, 'oɪ': 16897, 'ɔɨ': 16178, 'ɔː': 15555, 'ɐ̃': 15510, 'ɨː': 15065, 'ɜː': 14661, 'ju': 14500, 'pʲ': 14429, 'aɨ': 13971, 'əl': 13858, 'ɵ': 13512, 'kʲ': 13204, 'ss': 12685, 'ɐ̃ʊ̃': 12237, 't[': 12211, 'əɪ': 12209, 'ɑːɹ': 10053, 'bʲ': 9958, 'd[': 9275, 'yː': 9033, 'eʊ': 8931, 'ɨu': 7653, 'ɡʲ': 7633, 'ɔːɹ': 7072, '(en)': 6245, 'œy': 6018, 'kː': 5801, 'əɨ': 5629, 'ɔø': 5613, 'oːɹ': 5458, 'u"': 5356, 'fʲ': 5315, 'pː': 5307, 'ɛɹ': 5299, 'ɪː': 5068, '??': 5027, 'ɛː': 4894, 'øː': 4796, 'ɔɪ': 4603, 'dZ': 4345, 'ɪu': 4320, 'c': 4200, 'S': 4197, 'ʕ': 4050, '(fr)': 3885, 'ʌʊ': 3787, 'tS': 3731, 'oe': 3730, 'iə': 3654, 'dʒː': 3441, 'ɪɹ': 3101, 'r̝̊': 3099, 'bː': 3051, 'ɟ': 2761, 'uɪ': 2632, 'ʊɹ': 2589, 'tʃː': 2564, 'ħ': 2318, 'ũ': 2158, 's^': 2080, 't̪': 1778, 'r̝': 1702, 'ɪ^': 1651, 'tsː': 1558, 'dzː': 1479, 'r̩': 1272, 'u:': 1189, 'aɪɚ': 1180, '(de)': 1166, 's̪': 1070, 'dz': 1027, 'iʊ': 940, 'aɪə': 928, 'dˤ': 920, 'χ': 845, 'æi': 819, 'œ̃': 766, '(it)': 719, 'ɑ:': 715, 'o:': 623, 'n̩': 587, 'l̩': 536, 'æː': 534, 'dː': 532, 'õ': 491, 'N': 490, 'y:': 415, 'pf': 342, 'əʊ': 342, 'ʝ': 323, 't^': 288, 'oe:': 257, '(nl)': 237, 'ɛʊ': 237, '(ptpt)': 196, 'e:': 183, 'eə': 144, 'd^': 131, 'i.ː': 129, 'yʊ': 101, 't^ː': 79, 'nl': 76, '(fa)': 65, 'æiː': 64, 'yi': 63, '(es)': 61, 'dʒʲ': 47, 'qː': 43, '(ru)': 40, 'ɡː': 37, 'ɪuː': 37, 'ʊə': 34, 'X': 31, 'a.ː': 31, 'u.ː': 31, 'rr': 25, 'mb': 25, 'ɵː': 24, 'd̪w': 23, 'ʂ': 22, '(tt)': 22, 'dm': 20, 'daː': 17, 'əː': 17, 'it': 17, 'ɡd': 17, 'mj': 16, 'db': 16, 'wb': 16, 'iːː': 15, 'mt': 15, 'ɑk': 15, 'i:': 15, 'da': 14, 'nb': 14, 'eð': 13, 'mx': 13, 'maː': 13, 'tk': 13, 'niː': 13, 'rb': 13, 'mh': 13, 'dˤdˤ': 13, 'fm': 13, 'nm': 11, 'eːh': 11, 'mtʃ': 11, 'ma': 11, 'ʊːt': 11, 'aːn': 11, 'iːe': 11, 'im': 10, 'eːb': 10, 'np': 10, 'aɪaɪ': 9, 'ʃj': 9, 'eːt': 9, 'jː': 8, 'mv': 8, 'ae': 8, 'ed': 8, 'nn': 8, 'dtʃ': 8, 'ɑh': 8, 'tr': 7, 'sb': 7, 'tn': 7, 'lx': 7, 'eːa': 7, 'il': 7, 'ɑj': 7, 'ɑaː': 7, 'oːs': 7, 'eːq': 7, 'ah': 7, 'bb': 7, 'or': 7, 'ɑm': 7, 'kt': 7, 'as': 7, 'eβ': 6, 'eːd': 6, 'ob': 6, 'eːuː': 6, 'rtʃ': 6, 'ʃd': 6, 'md': 6, 'duː': 6, 'eːaː': 6, 'ɡz': 6, 'hx': 6, 'lk': 6, 'eːf': 6, 'ɑq': 6, 'nk': 6, 'nx': 6, 'nj': 6, 'dɔ': 6, 'is': 6, 'aɪp': 6, 'əm': 6, 'laː': 5, 'tb': 5, 'aa': 5, 'zm': 5, 'ntʃ': 5, 'ep': 5, 'bh': 5, 'lh': 5, 'do': 5, 'eːp': 5, 'miː': 5, 'tuː': 5, 'lm': 5, 'mr': 5, 'sz': 5, 'nv': 4, 'in': 4, 'ɑuː': 4, 'iaː': 4, 'oːb': 4, 'uːm': 4, 'naː': 4, 'eːs': 4, 'itʃ': 4, 'eːv': 4, 'az': 4, 'ha': 4, 'dɡ': 4, 'hb': 4, 'mf': 4, 'ɑn': 4, 'ia': 4, 'lb': 4, 'na': 4, 'ld': 4, 'd̪': 4, 'ndʒ': 3, 'uːk': 3, 'uːb': 3, 'mo': 3, 'ɡh': 3, 'mk': 3, 'dk': 3, 'eːtʃ': 3, 'dp': 3, 'tiː': 3, 'bv': 3, 'ta': 3, 'th': 3, 'ip': 3, 'eːj': 3, 'eːx': 3, 'mz': 3, 'eːz': 3, '(pl)': 2, 'diː': 2, 'eh': 2, 'biː': 2, 'mʃ': 2, 'mn': 2, 'dʒv': 2, 'hʃ': 2, 'liː': 2, 'mɡ': 2, 'jiː': 2, 'ms': 2, 'ʃz': 2, 'ne': 2, 'on': 2, 'raː': 2, 'nh': 2, 'eːʃ': 2, 'rm': 2, 'ʊː': 2, 'laɪ': 1, 'ɑa': 1, 'td': 1, 'eːe': 1, 'mp': 1, 'ɑt': 1, 'nr': 1, 'me': 1, 'ɑo': 1, 'ik': 1, 'iːv': 1, 'dh': 1, 'eːɑ': 1, 'dl': 1, 'ʃdʒ': 1, 'ns': 1} base_phonemes = [ # New: 'x', 'ç', 'ɣ', 'ʁ', 'ts', 'tʃ', 'ʌ', 'ɭ', # Stops 'p', 'b', 't', 'd', 'k', 'ɡ', 'ʔ', # Fricatives 'f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 'h', 'ʂ', 'ʐ', 'ɕ', 'ʑ', 'ʁ', # Affricates 'ts', 'dz', 'tɕ', 'dʑ', 'tʃ', 'dʒ', 'pf', # Nasals 'm', 'n', 'ŋ', # Liquids/Glides 'l', 'ɫ', 'ɹ', 'j', 'w', 'ɾ', # Palatalized 'nʲ', 'lʲ', 'rʲ', # Vowels 'i', 'iː', 'ɪ', 'e', 'eː', 'ɛ', 'æ', 'a', 'aː', 'ɑː', 'o', 'ɔː', 'ʊ', 'u', 'uː', 'ə', 'ɚ', 'ʌ', 'ɨ', 'ɤ', 'ɯ', 'œ', 'ø', 'øː', 'ʏ', 'yː', # Diphthongs 'eɪ', 'aɪ', 'ɔɪ', 'oʊ', 'aʊ', # Other 'ɒ', 'ɜː', 'ɲ', 'noise', 'SIL' ] phoneme_mapping = { # Core vowels - simplified based on confusion patterns 'ə': 'ə', #'ʌ': 'ə', # Merge due to high confusion 'ʌ': 'ʌ', # didn't work well before but still keep it 'ɪ': 'ɪ', 'i': 'i', 'iː': 'i:', 'ʊ': 'ʊ', 'u': 'u', 'uː': 'u:', 'ɛ': 'ɛ', 'e': 'e', 'eː': 'e:', 'ɔː': 'ɔ', 'ɔ': 'ɔ', #'ɒ': 'ɒ', # Merge to 'a' due to 100% wrong predictions in confusion matrix (23 Jan) 'ɒ': 'a', 'æ': 'æ', # DO NOT merge 'ɑː': 'a:', 'ɑ': 'a', 'a': 'a', 'ɜː': 'ʌ', 'ɜ': 'ʌ', 'ɚ': 'ɚ', 'o': 'o', 'ɨ': 'ɨ', # Common diphthongs - keep distinct ones 'eɪ': 'eɪ', 'aɪ': 'aɪ', 'ɔɪ': 'ɔɪ', 'aʊ': 'aʊ', 'oʊ': 'oʊ', # Less common diphthongs - map to similar common ones 'ʌʊ': 'aʊ', 'eʊ': 'aʊ', 'ɛʊ': 'aʊ', 'əʊ': 'oʊ', 'ɛɪ': 'eɪ', 'ʊɪ': 'aɪ', 'ea': 'eɪ', 'aʊ̯': 'aʊ', 'aɪ̯': 'aɪ', 'ɔʏ̯': 'ɔɪ', # Core consonants 'p': 'p', 'b': 'b', 't': 't', 'd': 'd', 'k': 'k', 'g': 'g', 'm': 'm', 'n': 'n', 'ŋ': 'ŋ', 'f': 'f', 'v': 'v', 'θ': 'θ', 'ð': 'ð', 's': 's', 'z': 'z', 'ʃ': 'ʃ', 'ʒ': 'ʒ', 'h': 'h', 'l': 'l', 'ɹ': 'ɹ', 'j': 'j', 'w': 'w', 'ɲ': 'ɲ', 'ɾ': 'ɾ', # Consonant mergers based on confusion # 'ɣ': 'g', # Merge with closest stop 'ɣ': 'ɣ', # emprically confused but will keep it #'ʁ': 'ɹ', # Map to rhotic 'ʁ': 'ʁ', 'r': 'ɹ', # Map to rhotic #'x': 'h', # Map to closest fricative 'x': 'x', #'ç': 'ʃ', # Map to closest fricative #'ç': 's', # Based on empirical confusion 'ç': 'ç', 'ʂ': 'ʃ', # Map to closest fricative 'ʐ': 'ʒ', # Map to closest fricative #'ɕ': 'ʃ', # Map to closest fricative 'ɕ': 'ɕ', # keep it 'ʑ': 'ʒ', # Map to closest fricative # Simplify affricates to their primary component #'ts': 't', 'ts': 'ts', 'dz': 'dʒ', #'tʃ': 'ʃ', 'tʃ': 'tʃ', #'dʒ': 'ʒ', 'dʒ': 'dʒ', 'tɕ': 'tʃ', 'dʑ': 'dʒ', 'pf': 'f', #'tʲ': 't', 'tʲ': 'tʲ', # high freuqncy, keep it #'nʲ': 'n', 'nʲ': 'nʲ', # high freuqncy, keep it #'rʲ': 'ɹ', 'rʲ': 'rʲ', # high freuqncy, keep it # Remove palatalization 'lʲ': 'l', 'dʲ': 'd', 'sʲ': 's', 'vʲ': 'v', 'fʲ': 'f', 'mʲ': 'm', 'pʲ': 'p', 'kʲ': 'k', 'bʲ': 'b', 'ɲʲ': 'ɲ', 'dʒʲ': 'dʒ', # Simplify geminate consonants 'tː': 't', 'dː': 'd', 'kː': 'k', 'gː': 'g', 'pː': 'p', 'bː': 'b', 'fː': 'f', 'vː': 'v', 'sː': 's', 'zː': 'z', 'ʃː': 'ʃ', 'ʒː': 'ʒ', 'mː': 'm', 'nː': 'n', 'ŋː': 'ŋ', 'lː': 'l', 'rː': 'ɹ', 'jː': 'j', # Nasal vowels to oral counterparts 'ɑ̃': 'a', 'ɛ̃': 'ɛ', 'ɔ̃': 'ɔ', 'ũ': 'u', 'õ': 'oʊ', 'ɐ̃': 'ʌ', # R-colored vowels 'ɑːɹ': 'ɚ', 'ɔːɹ': 'ɚ', 'ʊɹ': 'ɚ', 'ɪɹ': 'ɚ', 'ɛɹ': 'ɚ', 'oːɹ': 'ɚ', # Vowel sequences 'ia': 'i:', 'ua': 'u:', 'ɔø': 'ɔ', 'iːɛ': 'i:', 'ʊə': 'ʊ', 'iə': 'i:', 'eə': 'ɛ', # Common sequences # 'əl': 'əl', # Keep this distinct sequence #'əl': 'o', # based on empirical confusion. theoretically, this should be merged with 'l' or 'e' but it's most confused with 'o' 'əl': 'l', 'n̩': 'n', 'ʃf': 'ʃ', 'eð': 'ð', 'ns': 'n', 'nd': 'n', 'ʃts': 'ts', # Special symbols 'SIL': 'SIL', 'noise': 'noise', # noise will be ignored by the model. CTC will take it as blank token. '': 'SIL', 'ʔ': 'noise', # Language markers to silence '(en)': 'SIL', '(es)': 'SIL', '(fr)': 'SIL', '(de)': 'SIL', '(it)': 'SIL', '(nl)': 'SIL', '(pl)': 'SIL', '(ru)': 'SIL', '(ptpt)': 'SIL', # Error cases to noise '??': 'noise', 'uk': 'noise', 'it': 'noise', 'ɡd': 'noise', 'rd': 'noise', 'as': 'noise', 'up': 'noise', 'os': 'noise', 'kf': 'noise', '1': 'noise', 'ʃd': 'noise', 'ʃz': 'noise', 'ʃn': 'noise', # Vowels 'y': 'y', # Map to existing long form 'yː': 'y', # Keep distinct high front rounded vowel 'œ': 'ø', # Map to closest unrounded vowel 'ø': 'ø', # Map to long version 'øː': 'ø', # Keep distinct mid front rounded vowel 'ɐ': 'ʌ', # Map to schwa 'aː': 'a:', # Keep long a #'oː': 'ɔ', # Map to similar long vowel 'oː': 'o:', # Keep distinct long o 'ɛː': 'ɛ', # Map to base form 'ɪː': 'i:', # Map to similar long vowel 'ɵ': 'ʊ', # Map to closest vowel 'ᵻ': 'ɪ', # Map to similar vowel # Double vowels (map to their long counterparts) 'aa': 'a', 'ɐɐ': 'a', 'ææ': 'æ', # Diphthongs 'yʊ': 'u', # Map to similar monophthong 'œy': 'ɔɪ', # Map to similar diphthong 'uɪ': 'aɪ', # Map to existing diphthong 'oɪ': 'ɔɪ', # Map to similar diphthong 'iʊ': 'u', # Map to similar monophthong 'aɪə': 'aɪ', # Map to base diphthong 'aɪɚ': 'aɪ', # Map to base diphthong # Nasal vowels 'ɐ̃ʊ̃': 'aʊ', # Map to oral diphthong 'œ̃': 'ɛ', # Map to oral vowel # Consonants 'ʝ': 'j', # Map to similar approximant 'ɟ': 'ʒ', # Map to similar affricate 'ʋ': 'v', # Map to similar fricative 'd̪': 'd', # Map dental to alveolar 't̪': 't', # Map dental to alveolar 'ɬ': 'l', # Map to plain lateral 'ʎ': 'l', # Map to plain lateral 'β': 'v', # Map to similar fricative 'ɡ': 'g', # Standardize to 'g' # Geminate consonants 'ɡː': 'g', # Map to single consonant 'tsː': 'ts', # Map to single affricate 'dzː': 'd', # Map to single affricate #'tʃː': 'ʃ', # Map to single affricate 'tʃː': 'tʃ', # Map to single affricate 'dʒː': 'dʒ', # Map to single affricate 'ss': 's', # Map to single consonant # Palatalized consonants 'ɡʲ': 'g', # Map to plain consonant # Sequences 'dɔ': 'noise', # Map unusual sequence to noise # These are found (with counts) in Google MSWC data, but not in the OpenSLR-MLS data # Complex sequences with frequency counts 'ja': 'j', # Common sequence (36,809) -> simplify to first component 'ju': 'j', # Common sequence (19,620) -> simplify to first component 'tʃʲ': 'tʃ', # Common palatalized affricate (32,707) -> map to fricative #'ɭ': 'l', # Very common retroflex lateral (78,504) -> map to alveolar 'ɭ': 'ɭ', 'ɭʲ': 'ɭʲ', # Common palatalized retroflex (61,298) -> map to plain lateral 'u"': 'u', # Quote variant (7,265) -> normalize to standard long u 'ɪ^': 'ɪ', # Rare diacritic variant (2,222) -> remove diacritic 'sz': 's', # Rare sequence (5) -> simplify to first component #'q': 'k', # Common uvular stop (75,838) -> map to velar 'q': 'q', # keep it EVEN though it's relively rare (45k) #'qː': 'k', # Rare long uvular (103) -> map to velar 'qː': 'q', 'r̝̊': 'ɹ', # Rare trilled/fricative r (3,099) -> map to approximant 'r̝': 'ɹ', # Rare variant (1,702) -> map to approximant 'r̩': 'ɹ', # Rare syllabic (1,272) -> map to approximant 'l̩': 'l', # Rare syllabic (536) -> map to standard lateral 'c': 'k', # Uncommon palatal stop (4,195) -> map to velar # Vowel sequences 'uɨ': 'ɨ', # Common sequence (20,396) -> map to monophthong 'aɨ': 'aɪ', # Common sequence (13,971) -> map to similar diphthong 'ɨu': 'u:', # Less common (7,653) -> map to monophthong 'ɪu': 'u:', # Uncommon (4,320) -> map to monophthong 'ɨː': 'ɨ', # Common variant (15,065) -> remove length marker 'ɑɨ': 'aɪ', # Common sequence (24,604) -> map to diphthong 'əɪ': 'eɪ', # Common sequence (12,209) -> map to similar diphthong 'əɨ': 'ɨ', # Less common (5,629) -> simplify to first component 'ɔɨ': 'ɔɪ', # Common sequence (16,178) -> map to similar diphthong 'ɪuː': 'u:', # Rare sequence (37) -> map to monophthong # Rare sequences: 1-5 occurrences --------------------------------------- # Some of the extremely rare consonant-consonant and vowel-consonant sequences map to 'noise' (i.e., ignored), most don't. # More nasal sequences 'nm': 'n', # was 'noise', map to alveolar nasal 'nn': 'n', # was 'noise', map to single nasal 'mn': 'm', # was 'noise', map to bilabial nasal 'mm': 'm', # was 'noise', map to single nasal 'na': 'n', # was 'noise', preserve nasal 'maː': 'm', # was 'noise', preserve nasal 'mz': 'm', # was 'noise', preserve nasal 'ms': 'm', # was 'noise', preserve nasal 'mf': 'm', # was 'noise', preserve nasal 'mɡ': 'm', # was 'noise', preserve nasal 'mx': 'm', # was 'noise', preserve nasal 'mv': 'm', # was 'noise', preserve nasal 'mʃ': 'm', # current mapping is good # Stop sequences 'dk': 'd', # was 'noise', preserve first stop 'dp': 'd', # was 'noise', preserve first stop 'db': 'd', # was 'noise', preserve first stop 'td': 't', # was 'noise', preserve first stop 'tb': 't', # was 'noise', preserve first stop 'tn': 't', # was 'noise', preserve stop # Long vowel sequences 'eːs': 'e:', # was 'noise', preserve long vowel 'eːt': 'e:', # was 'noise', preserve long vowel 'eːp': 'e:', # was 'noise', preserve long vowel 'eːf': 'e:', # current mapping is good 'eːz': 'e:', # current mapping is good 'eːj': 'e:', # current mapping is good 'eːx': 'e:', # current mapping is good 'eːʃ': 'e:', # current mapping is good 'oːs': 'o:', # current mapping is good 'oːb': 'o:', # current mapping is good # Vowel sequences 'ɑj': 'aɪ', # was 'noise', map to diphthong 'ɑh': 'a', # was 'noise', preserve vowel 'ɑm': 'a', # was 'noise', preserve vowel 'ɑk': 'a', # was 'noise', preserve vowel 'ɑn': 'a', # was 'noise', preserve vowel 'ɑq': 'a', # was 'noise', preserve vowel 'ɑt': 'a', # was 'noise', preserve vowel 'ɑo': 'a', # was 'noise', preserve first vowel 'ɑa': 'a', # was 'noise', preserve first vowel 'ɑaː': 'a:', # was 'noise', map to long vowel 'ɑuː': 'aʊ', # was 'noise', map to diphthong # Other sequences 'dʒv': 'dʒ', # current mapping is good 'bv': 'b', # was 'noise', preserve stop 'bh': 'b', # was 'noise', preserve stop 'ɡh': 'g', # was 'noise', preserve stop 'ɡz': 'g', # was 'noise', preserve stop 'hx': 'x', # current mapping is good 'ʃj': 'ʃ', # was 'noise', preserve fricative # Special cases '(fa)': 'SIL', # current mapping is good 'bb': 'b', # current mapping is good 'uːb': 'u:', 'uːk': 'u:', 'laɪ': 'noise', # --------------------------------------- End of rare sequences # Vowels and length variants 'əː': 'ə', # Long schwa maps to schwa (index 18) 'æː': 'æ', # Long ash maps to ash (index 32) 'æi': 'eɪ', # Map to similar diphthong (index 23) 'æiː': 'eɪ', # Map to similar diphthong (index 23) 'ɵː': 'ʊ', # Long rounded vowel maps to nearest equivalent (index 22) #'ɯ': 'ʊ', # Unrounded high back vowel maps to nearest equivalent (index 22) 'ɯ': 'ɯ', # Alternative transcription formats 'e:': 'e:', # Long e 'eː': 'e:', # Normalize colon to IPA length mark (index 43) #'e:': 'e', # NOT merged due to high confusion 'o:': 'o:', 'y:': 'y', # Normalize colon to IPA length mark (index 39) 'u:': 'u:', # Normalize colon to IPA length mark (index 5) 'i:': 'i:', # Normalize colon to IPA length mark (index 12) 'ɑ:': 'a', # Normalize colon to IPA length mark (index 13) 'oe:': 'ø', # Normalize colon to IPA length mark (index 40) 'oe': 'ø', # Map to equivalent (index 40) # ASCII-based transcription variants 'S': 's', # ASCII variant of 's' (index 21) 'N': 'n', # ASCII variant of 'n' (index 11) 'X': 'k', # ASCII variant, typically representing 'k' (index 27) 'tS': 'tʃ', # ASCII variant of 'tʃ' (index 1) 'dZ': 'dʒ', # ASCII variant of 'dʒ' (index 2) # Special characters and diacritics 't^': 't', # Remove diacritic (index 4) 's^': 's', # Remove diacritic (index 21) 'd^': 'd', # Remove diacritic (index 9) 't^ː': 't', # Remove diacritic and length (index 4) 't[': 't', # Remove bracket notation (index 4) 'd[': 'd', # Remove bracket notation (index 9) # Arabic phonemes 'ʕ': 'h', # Voiced pharyngeal fricative maps to nearest fricative (index 37) 'ħ': 'h', # Voiceless pharyngeal fricative maps to 'h' (index 37) 'dˤ': 'd', # Pharyngealized 'd' maps to plain 'd' (index 9) 's̪': 's', # Dental 's' maps to plain 's' (index 21) 'χ': 'x', # Voiceless uvular fricative maps to 'h' (index 37) 'dˤdˤ': 'd', # Doubled pharyngealized 'd' maps to 'd' (index 9) 'dd': 'd', # ASCII variant of doubled/pharyngealized 'd' (index 9) # Dot notation variants 'i.ː': 'i:', # Normalize dot notation (index 12) 'a.ː': 'a:', # Normalize dot notation (index 13) 'u.ː': 'u:', # Normalize dot notation (index 5) # Lateral approximant variant 'ɫ': 'l', # Velarized lateral maps to plain 'l' (index 16) # Consonant sequences (map to noise) 'kt': 'noise', # Consonant sequence (index 50) 'd̪w': 'noise', # Consonant sequence (index 50) 'wb': 'noise', # Consonant sequence (index 50) 'fm': 'noise', # Consonant sequence (index 50) # Vowel-consonant sequences (map to noise) 'ʊːt': 'noise', # Vowel-consonant sequence (index 50) 'aɪp': 'noise', # Vowel-consonant sequence (index 50) 'əm': 'noise', # Vowel-consonant sequence (index 50) 'aːn': 'a:', # Vowel-consonant sequence (index 50) 'iːe': 'i:', # Vowel-vowel sequence (index 50) 'yi': 'i:', # Vowel-vowel sequence (index 50) # Language markers (map to SIL) '(tt)': 'SIL', # Language marker (index 0) # Double long vowel - map to standard long vowel 'iːː': 'i:', # Excessive length mark, normalize to standard long i (index 12) # Doubled diphthong - map to single diphthong 'aɪaɪ': 'aɪ', # Repeated diphthong, map to single instance (index 7) # Consonant sequences - map to noise like other sequences 'ndʒ': 'dʒ', # Consonant cluster (index 50) 'tr': 'noise', # Consonant cluster (index 50) 'eβ': 'noise', # Vowel-consonant sequence (index 50) # Double palatalization - map to single palatalized form then apply existing mappings 'ʂʲ': 'ʃ', # Map palatalized retroflex to palato-alveolar (index 1) 'nʲʲ': 'nʲ', # Double palatalized nasal to plain nasal (index 11) 'tsʲ': 'ts', # Palatalized affricate follows affricate mapping (index 4) 'xʲ': 'h', # Palatalized velar fricative to h (index 37) 'dʑʲ': 'dʒ', # Palatalized voiced affricate to voiced palato-alveolar (index 2) 'ɕʲ': 'ɕ', # Palatalized alveolo-palatal to palato-alveolar (index 1) 'tɕʲ': 'ʃ', # Palatalized affricate to palato-alveolar (index 1) 'tʲʲ': 'tʲ', # Double palatalized stop to plain stop (index 4) 'ʒʲ': 'ʒ', # Palatalized palato-alveolar remains (index 2) 'ʃʲʲ': 'ʃ', # Double palatalized palato-alveolar remains (index 1) 'tsʲʲ': 'ts', # Double palatalized affricate to stop (index 4) 'ɾʲʲ': 'ɾ', # Double palatalized tap remains (index 48) 'zʲʲ': 'z', # Double palatalized fricative remains (index 36) 'ɾʲ': 'rʲ', # Palatalized tap remains (index 48) 'ʃʲ': 'ʃ', # Palatalized palato-alveolar remains (index 1) 'mʲʲ': 'm', # Double palatalized nasal to plain (index 28) 'ʲ': 'noise', # Isolated palatalization mark to noise (index 50) # Vowel sequences - map to nearest phoneme or diphthong 'uo': 'oʊ', # Map to nearest diphthong (index 24) 'ee': 'i:', # Map to long vowel (index 12) 'ie': 'i:', # Map to long vowel (index 12) 'ai': 'aɪ', # Map to standard diphthong (index 7) 'ui': 'u:', # Map to long vowel (index 5) 'au': 'aʊ', # Map to standard diphthong (index 8) 'eɑ': 'ɛ', # Map to nearest monophthong (index 6) 'iu': 'u:', # Map to long vowel (index 5) 'auː': 'aʊ', # Map to standard diphthong (index 8) 'ei': 'eɪ', # Map to standard diphthong (index 23) 'eu': 'oʊ', # Map to nearest diphthong (index 24) 'aiː': 'aɪ', # Map to standard diphthong (index 7) 'iuː': 'u:', # Map to long vowel (index 5) 'eiː': 'eɪ', # Map to standard diphthong (index 23) 'euː': 'oʊ', # Map to nearest diphthong (index 24) 'ɔa': 'ɔ', # Map to long vowel (index 3) 'yɪ': 'y', # Map to long vowel (index 39) 'iɪ': 'i:', # Map to long vowel (index 12) 'eo': 'oʊ', # Map to nearest diphthong (index 24) # Special notations 'cː': 'k', # Long palatal stop to velar (index 27) # All Chinese tonal patterns (with numbers) and complex sequences map to 'noise' # Examples: 'iɜk': 'noise', 'onɡ5': 'noise', 'ts.': 'ts', 'ə5': 'noise', 'ŋf': 'noise', 'u2': 'noise', 'oɜɕ': 'noise', 'iɜ': 'noise', # MLS-fr # Consonant sequences to noise 'ls': 'noise', # Lateral + fricative sequence maps to noise (50) 'll': 'noise', # Double lateral sequence maps to noise (50) # Vowel-consonant sequences to noise 'øːl': 'noise', # Long oe + lateral sequence maps to noise (50) 'øːs': 'noise', # Long oe + fricative sequence maps to noise (50) # from UCLA Phonetics Dataset # Syllabic consonants - map to their non-syllabic counterparts 'h̩': 'h', # Syllabic h to h (37) 'ɹ̩': 'ɹ', # Syllabic r to r (17) 'ŋ̩': 'ŋ', # Syllabic ng to ng (34) 'ɫ̩': 'l', # Syllabic dark l to l (16) 'v̩': 'v', # Syllabic v to v (15) 'm̩': 'm', # Syllabic m to m (28) # Aspirated consonants - map to unaspirated counterparts 'pʰ': 'p', # Aspirated p to p (25) 'tʰ': 't', # Aspirated t to t (4) 'kʰ': 'k', # Aspirated k to k (27) 'sʰ': 's', # Aspirated s to s (21) 'ʃʰ': 'ʃ', # Aspirated sh to sh (1) 'cʰ': 'k', # Aspirated c to k (27) 't͡sʰ': 'ts', # Aspirated ts to t (4) 't͡ʃʰ': 'tʃ', # Aspirated tsh to sh (1) 'ɕʰ': 'ɕ', # Aspirated alveolo-palatal to sh (1) # Labialized consonants - map to base consonants 'tʷ': 't', # Labialized t to t (4) 'kʷ': 'k', # Labialized k to k (27) 'pʷ': 'p', # Labialized p to p (25) 'ʒʷ': 'ʒ', # Labialized zh to zh (2) 'xʷ': 'h', # Labialized x to h (37) 'dʷ': 'd', # Labialized d to d (9) 'bʷ': 'b', # Labialized b to b (26) 'mʷ': 'm', # Labialized m to m (28) 'ŋʷ': 'ŋ', # Labialized ng to ng (34) # Retroflexes - map to closest non-retroflex 'ʈ': 't', # Retroflex t to t (4) 'ɖ': 'd', # Retroflex d to d (9) 'ɳ': 'n', # Retroflex n to n (11) 'ɻ': 'ɹ', # Retroflex r to r (17) 'ɽ': 'ɾ', # Retroflex flap to tap (48) # Breathy voiced - map to regular voiced 'n̤': 'n', # Breathy n to n (11) 'b̤': 'b', # Breathy b to b (26) 'j̤': 'j', # Breathy j to j (29) 'a̤': 'a', # Breathy a to long a (30) 'i̤ː': 'i:', # Breathy long i to long i (12) 'o̤': 'o', # Breathy o to o (44) 'o̤ː': 'o:', # Breathy long o to o (44) # Nasalized vowels - map to oral counterparts 'ãː': 'a:', # Nasalized long a to long a (30) 'ẽ': 'e', # Nasalized e to e (42) 'ɪ̃': 'ɪ', # Nasalized short i to short i (31) 'ỹ': 'y', # Nasalized y to long y (39) 'õː': 'o:', # Nasalized long o to o (44) 'æ̃': 'æ', # Nasalized ae to ae (32) 'ʌ̃': 'ʌ', # Nasalized wedge to schwa (18) 'ə̃': 'ə', # Nasalized schwa to schwa (18) 'ã': 'a', # Nasalized a to long a (30) 'ĩ': 'i:', # Nasalized i to long i (12) 'ĩː': 'i:', # Nasalized long i to long i (12) 'ũː': 'u:', # Nasalized long u to long u (5) # Affricates - map to primary component 't͡s': 'ts', # ts to t (4) 't͡ʃ': 'tʃ', # tsh to sh (1) 'd͡ʒ': 'dʒ', # dzh to zh (2) 't͡ɬ': 't', # tl to t (4) # Ejectives - map to non-ejective counterparts 'tʼ': 't', # Ejective t to t (4) 'kʼ': 'k', # Ejective k to k (27) 'qʼ': 'q', # Ejective q to k (27) 'pʼ': 'p', # Ejective p to p (25) 'sʼ': 's', # Ejective s to s (21) # Additional vowels 'ʏ': 'ɪ', # Near-close near-front rounded to short i (31) 'ʏː': 'y', # Long near-close near-front rounded to long y (39) 'ʊː': 'ʊ', # Long near-close near-back rounded to short u (22) 'ɤ': 'ə', # Close-mid back unrounded to schwa (18) 'ɤː': 'ə', # Long close-mid back unrounded to schwa (18) 'œː': 'ø', # Long open-mid front rounded to long oe (40) 'ɯː': 'u:', # Long close back unrounded to long u (5) 'ɛ̤': 'ɛ', # Breathy open-mid front unrounded to epsilon (6) # Short/reduced vowels 'ĕ': 'e', # Short e to e (42) 'ă': 'a', # Short a to long a (30) 'ĭ': 'ɪ', # Short i to short i (31) 'ŏ': 'o', # Short o to o (44) 'ŭ': 'ʊ', # Short u to short u (22) # Laryngealized/creaky vowels - map to regular vowels 'ḛ': 'e', # Creaky e to e (42) 'ḭ': 'i', # Creaky i to i (41) 'o̰': 'o', # Creaky o to o (44) 'ɛ̰': 'ɛ', # Creaky epsilon to epsilon (6) 'a̰': 'a', # Creaky a to long a (30) 'ʊ̰': 'ʊ', # Creaky upsilon to upsilon (22) # Additional consonants 'ɦ': 'h', # Voiced h to h (37) 'ʍ': 'w', # Voiceless w to w (47) 'ɢ': 'g', # Uvular g to g (10) 'ɱ': 'm', # Labiodental nasal to m (28) 'ʔ': 'noise', # Glottal stop to noise (50) 'ɮ': 'z', # Voiced lateral fricative to z (36) 'ɸ': 'f', # Bilabial fricative to f (20) # Co-articulated stops 'k͡p': 'k', # was 'noise', map to velar stop as it's typically more salient 'ɡ͡b': 'g', # was 'noise', map to velar stop (voiced counterpart) 'p͡t': 'p', # was 'noise', map to first stop in sequence 'b͡d': 'b', # was 'noise', map to first stop in sequence # Lengthened consonants 'ʔː': 'q', # was 'noise', map to closest glottal/uvular stop in inventory 'hː': 'h', # was 'noise', map to plain glottal fricative 'æ̆': 'æ', # Short ae to ae (32) 'ɜ̆': 'ə', # Short epsilon to long epsilon (33) 'ɔ̆': 'ʌ', # Short open-o to long open-o (3) 'ə̠': 'ʌ', # Retracted schwa (when it appears in stressed positions) 'ə̆': 'ə', # Short schwa to schwa (18) 'ɒː': 'a:', # Long open-o to long open-o (3) # Aspirated and modified affricates 'd͡ʒʰ': 'dʒ', # Aspirated dzh to zh (2) 't͡sʼ': 'ts', # Ejective ts to t (4) 't͡ʃʼ': 'tʃ', # Ejective tsh to sh (1) 't͡ɬʼ': 't', # Ejective tl to t (4) 't͡ʃʲ': 'tʃ', # Palatalized tsh to sh (1) 'd͡ʒʲ': 'dʒ', # Palatalized dzh to zh (2) # Voiceless sonorants 'e̥': 'e', # Voiceless e to e (42) 'ɲ̥': 'ɲ', # Voiceless ny to ny (38) 'm̥': 'm', # Voiceless m to m (28) 'n̥': 'n', # Voiceless n to n (11) 'l̥': 'l', # Voiceless l to l (16) 'r̥': 'ɹ', # Voiceless r to r (17) 'ŋ̥': 'ŋ', # Voiceless ng to ng (34) 'i̥': 'i', # Voiceless i to i (41) 'u̥': 'u:', # Voiceless u to long u (5) 'ʎ̥': 'l', # Voiceless palatal l to l (16) # Long consonants 'tʰː': 't', # Long aspirated t to t (4) 'çː': 'ç', # Long palatal fricative to h (37) 'xː': 'h', # Long x to h (37) 'ɟː': 'ʒ', # Long palatal stop to zh (2) 'l̪ː': 'l', # Long dental l to l (16) 'pʰː': 'p', # Long aspirated p to p (25) 'θː': 'θ', # Long th to th (46) 'ɲː': 'ɲ', # Long ny to ny (38) 'wː': 'w', # Long w to w (47) # Modified velars 'kʰʲ': 'k', # Palatalized aspirated k to k (27) 'kʼʲ': 'k', # Palatalized ejective k to k (27) 'qʰʷ': 'q', # Labialized aspirated q to k (27) 'kʰʷ': 'k', # Labialized aspirated k to k (27) 'kʷʰ': 'k', # Labialized aspirated k to k (27) 'kʷʼ': 'k', # Labialized ejective k to k (27) 'qʷ': 'q', # Labialized q to k (27) 'qʷʼ': 'q', # Labialized ejective q to k (27) 'qʰ': 'q', # Aspirated q to k (27) 'q̠': 'q', # Retracted q to k (27) 'ɢʲ': 'g', # Palatalized uvular g to g (10) 'ɡʷ': 'g', # Labialized g to g (10) # Rhotic vowels 'e˞': 'ɚ', # Rhotacized e to schwar (14) 'a˞': 'ɚ', # Rhotacized a to schwar (14) 'o˞': 'ɚ', # Rhotacized o to schwar (14) 'u˞': 'ɚ', # Rhotacized u to schwar (14) 'i˞': 'ɚ', # Rhotacized i to schwar (14) # Nasalized variants 'ɛ̃ː': 'ɛ', # Long nasalized epsilon to epsilon (6) 'ʊ̃': 'ʊ', # Nasalized upsilon to upsilon (22) 'z̃': 'z', # Nasalized z to z (36) 'j̃': 'j', # Nasalized j to j (29) 'w̃': 'w', # Nasalized w to w (47) 'ʊ̰̃': 'ʊ', # Creaky nasalized upsilon to upsilon (22) 'æ̃ː': 'æ', # Long nasalized ae to ae (32) 'ɔ̃ː': 'ɔ', # Long nasalized open-o to long open-o (3) 'ɛ̰̃': 'ɛ', # Creaky nasalized epsilon to epsilon (6) # Modified dentals/alveolars 'd̪ʰ': 'd', # Aspirated dental d to d (9) 't̪ʰ': 't', # Aspirated dental t to t (4) 't̪ʲ': 'tʲ', # Palatalized dental t to t (4) 'tʲʰ': 'tʲ', # Palatalized aspirated t to t (4) 'dʰ': 'd', # Aspirated d to d (9) 'ðʲ': 'ð', # Palatalized eth to eth (35) 'zʲ': 'z', # Palatalized z to z (36) 'zʷ': 'z', # Labialized z to z (36) # Complex modifications 'ʃʷ': 'ʃ', # Labialized sh to sh (1) 'ɕʷ': 'ɕ', # Labialized alveolo-palatal to sh (1) 'ʑʷ': 'ʒ', # Labialized voiced alveolo-palatal to zh (2) 'ʕʷ': 'h', # Labialized pharyngeal to h (37) 'ħʷ': 'h', # Labialized voiceless pharyngeal to h (37) 'ʁʷ': 'ɹ', # Labialized uvular to r (17) 'χʲ': 'h', # Palatalized x to h (37) 'hʲ': 'h', # Palatalized h to h (37) # Retracted/advanced variants 'ɨ̠': 'ɨ', # Retracted barred-i to barred-i (45) 'ʊ̠': 'ʊ', # Retracted upsilon to upsilon (22) 'ʊ̟': 'ʊ', # Advanced upsilon to upsilon (22) 'æ̟': 'æ', # Advanced ae to ae (32) 'ə̟': 'ə', # Advanced schwa to schwa (18) # Dental variants 'n̪': 'n', # Dental n to n (11) 'l̪': 'l', # Dental l to l (16) # Special vowels 'ö': 'ø', # O-umlaut to long oe (40) 'ü': 'y', # U-umlaut to long y (39) 'ʉ': 'ɨ', # Central u to long u (5) 'ɞ': 'ə', # Open-mid central rounded to schwa (18) 'ɤ̈': 'ə', # Advanced close-mid back unrounded to schwa (18) 'ɯ̈': 'ɨ', # Advanced high back unrounded # Implosives/ejectives/glottalized 'ɗ': 'd', # Implosive d to d (9) 'ɓ': 'b', # Implosive b to b (26) 'ʄ': 'ʒ', # Implosive palatal to zh (2) 'dˀ': 'd', # Glottalized d to d (9) 'bˀ': 'b', # Glottalized b to b (26) 'ˀa': 'a', # Preglottalized a to long a (30) # Modified retroflexes 'ʈʰ': 't', # Aspirated retroflex t to t (4) 'ɖʰ': 'd', # Aspirated retroflex d to d (9) # Remaining special cases 'ɥ': 'j', # Labial-palatal approximant to j (29) 'ʀ': 'ɹ', # Uvular trill to r (17) 'ɹ̝': 'ɹ', # Raised r to r (17) 'ṽ': 'v', # Nasalized v to v (15) 'ə̥': 'ə', # Voiceless schwa to schwa (18) 'ə̯': 'ə', # Non-syllabic schwa to schwa (18) 'i̯': 'i', # Non-syllabic i to i (41) 'l̴': 'l', # Velarized l to l (16) 'dⁿ': 'd', # Prenasalized d to d (9) 'tⁿ': 't', # Prenasalized t to t (4) # Breathy/creaky variants 'd̪̤': 'd', # Breathy dental d to d (9) 'ɑ̤': 'a', # Breathy long a to long a (13) 'ṳː': 'u:', # Breathy long u to long u (5) 'ṳ': 'u:', # Breathy u to long u (5) 'ɯ̤': 'u:', # Breathy unrounded u to long u (5) 'ɪ̰': 'ɪ', # Creaky short i to short i (31) 'ɔ̰': 'ɔ', # Creaky open-o to long open-o (3) 'ɔ̤': 'ɔ', # Breathy open-o to long open-o (3) # Height/backness variants 'ɑ̝': 'a', # Raised long a to long a (13) 'ɛ̞': 'ɛ', # Lowered epsilon to epsilon (6) 'ɛ̝': 'ɛ', # Raised epsilon to epsilon (6) 'e̝': 'e', # Raised e to e (42) 'o̝': 'o', # Raised o to o (44) 'u̝': 'u:', # Raised u to long u (5) 'ɑ̞': 'a', # Lowered long a to long a (13) 'a̘': 'a', # Advanced tongue root a to long a (30) 'ä': 'a', # Centralized a to long a (30) # Modified vowel quality 'ɛ̈': 'ɛ', # Centralized epsilon to epsilon (6) 'œ̈': 'ø', # Centralized oe to long oe (40) 'ʌ̈': 'ʌ', # Centralized wedge to schwa (18) 'ɛ̠': 'ɛ', # Retracted epsilon to epsilon (6) 'a̠': 'a', # Retracted a to long a (30) 'o̠': 'o', # Retracted o to o (44) 'i̠': 'i', # Retracted i to i (41) # Remaining consonant variants 't̠': 't', # Retracted t to t (4) 'd̠': 'd', # Retracted d to d (9) 'n̠': 'n', # Retracted n to n (11) 't̟': 't', # Advanced t to t (4) 'r̟': 'ɹ', # Advanced r to r (17) 'r̠': 'ɹ', # Retracted r to r (17) 'rˠ': 'ɹ', # Velarized r to r (17) 'ɪ̥': 'ɪ', # Voiceless short i to short i (31) 'ʔʷ': 'noise', # Labialized glottal stop to noise (50) 'ɕʼ': 'ɕ', # Ejective alveolo-palatal to sh (1) 'cʼ': 'k', # Ejective c to k (27) 'cʷʰ': 'k', # Labialized aspirated c to k (27) 'w̝': 'w', # Raised w to w (47) 'ʃ̠': 'ʃ', # Retracted sh to sh (1) 'ɪ̰̃': 'ɪ', # Creaky nasalized short i to short i (31) 'tʷʼ': 't', # Labialized ejective t to t (4) 'ŋʲ': 'ŋ', # Palatalized ng to ng (34) 'bʰ': 'b', # Aspirated b to b (26) 'æ̈': 'æ', # Centralized ae to ae (32) 'ɘ': 'ə', # Close-mid central unrounded vowel to schwa (18) 'tsʰ': 'ts', # Aspirated ts to ts (4) 'r̩ː': 'ɚ', # Long rhotic schwa to schwar (14) } def get_compound_phoneme_mapping(phoneme): # First try direct mapping if phoneme in phoneme_mapping: return phoneme_mapping[phoneme] # For compound phonemes, map components and combine mapped = "" remaining = phoneme while remaining: found = False # Try to match longest possible substring first for i in range(len(remaining), 0, -1): subset = remaining[:i] if subset in phoneme_mapping: mapped += phoneme_mapping[subset] remaining = remaining[i:] found = True break if not found: # If no mapping found for current character, treat as noise remaining = remaining[1:] return mapped if mapped else "noise" def create_normalized_mapping(mapping_dict): # Create normalized version of the mapping from unicodedata import normalize """Create a mapping dictionary with normalized Unicode characters.""" return { normalize('NFC', key): normalize('NFC', value) for key, value in mapping_dict.items() } phoneme_mapper = create_normalized_mapping(phoneme_mapping) #Both the 'key' and value ar normalized #print(phoneme_mapper) def analyze_phoneme_merger(phoneme_mapper): # Check for circular references def check_circular_refs(mapper): issues = [] for phoneme, target in mapper.items(): if target in mapper and mapper[target] != target: issues.append(f"Potential circular reference: {phoneme} -> {target} -> {mapper[target]}") return issues # Check for consistency in vowel merging def check_vowel_consistency(mapper): issues = [] # Common vowel pairs that should merge consistently vowel_pairs = [ ('ɑː', 'ɑːɹ'), # Long a with/without r ('ɔː', 'ɔːɹ'), # Long o with/without r ('iː', 'iə'), # Long i and i-schwa ('ʊ', 'ʊɹ'), # Short u with/without r ] for v1, v2 in vowel_pairs: if v1 in mapper and v2 in mapper: if mapper[v1] != mapper[v2]: issues.append(f"Inconsistent vowel mapping: {v1} -> {mapper[v1]} but {v2} -> {mapper[v2]}") return issues # Check for r-colored vowel consistency def check_r_colored_consistency(mapper): issues = [] r_colored = ['ɪɹ', 'ɛɹ', 'ʊɹ'] target = 'ɚ' # All should map to schwa-r for phoneme in r_colored: if phoneme in mapper and mapper[phoneme] != target: issues.append(f"Inconsistent r-colored vowel: {phoneme} -> {mapper[phoneme]}, expected -> {target}") return issues # Check compound phoneme handling def check_compound_handling(mapper): issues = [] for phoneme in mapper: if len(phoneme) > 1 and phoneme not in ['tʃ', 'dʒ', 'aɪ', 'eɪ', 'oʊ', 'aʊ', 'ɔɪ', 'iə', 'uː', 'iː', 'ɑː', 'ɔː', 'ɜː', 'əl']: if not phoneme.startswith(mapper[phoneme][0]): issues.append(f"Potentially incorrect compound mapping: {phoneme} -> {mapper[phoneme]}") return issues # Collect all issues all_issues = [] all_issues.extend(check_circular_refs(phoneme_mapper)) all_issues.extend(check_vowel_consistency(phoneme_mapper)) all_issues.extend(check_r_colored_consistency(phoneme_mapper)) #all_issues.extend(check_compound_handling(phoneme_mapper)) print("Testing complete vocab:") for kv in list(complete_vocab.keys()): mapped = get_compound_phoneme_mapping(kv) if (mapped != kv): if (mapped == 'noise') or (complete_vocab[kv] > 5000): print(f"{kv} -> {mapped} \tcount: {complete_vocab[kv]}") # Verify coverage missing_phonemes = set(complete_vocab.keys()) - set(phoneme_mapping.keys()) print(f"Missing phonemes: {missing_phonemes}") for phoneme in missing_phonemes: print(f"{phoneme} -> {complete_vocab[phoneme]}") return all_issues def create_new_index(): # First, count the frequencies mapping to the new phonemes (count merged branches) phoneme_vocab_mapped_counts = {} for key, value in phoneme_mapper.items(): if value not in phoneme_vocab_mapped_counts: phoneme_vocab_mapped_counts[value] = 0 phoneme_vocab_mapped_counts[value] += 1 print(f"Mapped: {len(phoneme_mapper)} phonemes onto {len(phoneme_vocab_mapped_counts)} phonemes") #print(phoneme_mapper) # Sort phonemes by frequency in descending order, excluding SIL sorted_phonemes = sorted( [p for p in phoneme_vocab_mapped_counts.keys() if p not in ['SIL', 'noise']], key=lambda x: phoneme_vocab_mapped_counts[x], reverse=True ) # Create the index mapping phoneme_mapped_index = {} # Put SIL at index 0 phoneme_mapped_index['SIL'] = 0 # Add the rest of the phonemes with indices starting from 1 for i, phoneme in enumerate(sorted_phonemes): phoneme_mapped_index[phoneme] = i + 1 # Put noise at the last index phoneme_mapped_index['noise'] = len(sorted_phonemes) + 1 print("New index created:") print(phoneme_mapped_index) print("Unique phonemes in the new index:") print(list(phoneme_mapped_index.keys())) # Run the analysis issues = analyze_phoneme_merger(phoneme_mapper) # Print findings print("Found the following potential issues:") for i, issue in enumerate(issues, 1): print(f"{i}. {issue}") # Additional validation of the phoneme_mapped_index mapped_phonemes = set(phoneme_mapped_index.keys()) merger_outputs = set(p for p in phoneme_mapper.values() if not p.endswith('*')) missing_indices = merger_outputs - mapped_phonemes extra_indices = mapped_phonemes - merger_outputs print("\nIndex validation:") if missing_indices: print(f"Merged phonemes missing from index: {missing_indices}") if extra_indices: print(f"Extra phonemes in index: {extra_indices}") print("Done") def check_missing_phonemes(): test_phonemes = ['a', 'd͡ʒ', 'ʃʲ', 'm', 'ɜ', 'ɘ', 'ʃ', 't͡ʃʰ', 'r', 'ä', 't͡ʃ', 'ə̆', 'pʰ', 'ɜ̆', 'ʌ̈', 't', 'ʃʰ', 'kʼ', 'ʒʲ', 'ə', 'ă', 'b', 'ɨ', 'æ̈', 'j', 'ɛ̈', 'p', 'd', 'n', 'ɥ', 'ɡ', 't͡ʃʼ', 'χ', 'ˀa', 'ʒ', 'ħʷ', 'ɹ', 'ħ', 'œ̈', 'ɾ', 'ʁ', 'ɤ̈', 'z', 'i', 'χʲ', 'tʰ', 's', 'ʁʷ', 'h', 'ɛ', 'k', 'ɑ', 'x', 'ɔ', 'o', 'u', 'e', 'ɑ̃', 'ŋ', 'l', 'ʊ', 'ã', 'q̠', 'õ', 'w', 'β', 'f', 'v', 'ʎ', 'oː', 'eː', 'kʰ', 'ð', 'œ', 'ɹ̩', 'ɛ̝', 'ʔ', 'l̥', 'e̝', 'aː', 'uː', 'iː', 'ʌ̃', 'æ', 'ẽ', 'y', 'yː', 'ɪː', 'ɛː', 'øː', 'œː', 'ɑː', 'o̝', 'ʌ', 'ø', 'ɯ', 'sː', 'ɛ̃', 'c', 'ɪ', 'ɟ', 'ɲ', 'æː', 'æ̃ː', 'ʉ', 'ɫ̩', 'ʋ', 'ɫ', 'kʲ', 'ɣ', 'ɦ', 'n̩', 'ɸ', 'dʰ', 'm̩', 'h̩', 'ç', 'bʰ', 't̪', 'd̪', 'd̪̤', 'b̤', 'n̪', 'ĩ', 'ũː', 'ũ', 'j̤', 'l̪', 'pː', 'kː', 'rː', 'nː', 'l̪ː', 'bː', 'mː', 'ɞ', 't̪ʲ', 'hː', 'ʔː', 'tː', 'dː', 'ʈ', 'ɖ', 'ʂ', 'ʐ', 'r̥', 'ɔː', 'ʏː', 'ʏ', 'θ', 'n̥', 'cː', 'ɟː', 'fː', 'lː', 'ŋ̥', 'ə̯', 'ə̟', 'i̯', 'ʊ̟', 'ɛ̞', 'ʊ̠', 'r̟', 'r̠', 'ɕ', 'pʲ', 'bʲ', 'ŭ', 'tʲ', 'ĕ', 'dʲ', 'ɡʲ', 'nʲ', 'fʲ', 'zʲ', 'vʲ', 'lʲ', 'sʲ', 'xʲ', 'hʲ', 'ŏ', 'mʲ', 't͡ʃʲ', 'd͡ʒʲ', 'æ̆', 'ŋʲ', 'rʲ', 'ɾʲ', 'ĭ', 'ɔ̆', 's̪', 'ɱ', 'ɽ', 'ɳ', 'ʈʰ', 'ɖʰ', 'ɵ', 't̪ʰ', 'd͡ʒʰ', 'ɭ', 'ʊ̃', 'sʰ', 'ḭ', 'cʰ', 'ʊ̰', 'ɛ̰', 'ɪ̰', 'a̰', 'ḛ', 'o̰', 'ɛ̰̃', 'ɪ̃', 'ʊ̰̃', 'ɲ̥', 'æ̃', 'm̥', 'ɪ̰̃', 'ɔ̰', 'wː', 'ɔ̃ː', 'ɗ', 'ɔ̃', 'õː', 'ɯː', 'ə̃', 'tʰː', 'pʰː', 'vː', 'zː', 'ʃː', 'jː', 'ɲː', 'xː', 'çː', 'ɓ', 'ãː', 't͡sʼ', 'ɻ', 'ʀ', 't͡s', 'a', 'b', 'w', 'e', 'ɔ', 'p', 'ɛ', 't', 'o', 't͡ʃ', 'u', 'd', 'k', 'ɔ̃', 'kʷ', 'ɡ', 'k͡p', 'm', 'n', 'n̠', 'j', 'f', 's', 'ç', 'ɹ', 'l', 'i', 'ʍ', 'd̠', 'ʐ', 'ŋ', 'ɥ', 't̠', 'ɕʷ', 'ɕ', 'pʰ', 'tʰ', 'sʰ', 'kʰ', 'z', 'ä', 'h', 'v', 'ʃ', 'ʒ', 'r', 'ü', 'y', 'ʔ', 'ɪ', 'æ', 'ə', 'q̠', 'ɞ', 't͡ʃʰ', 'ĩ', 'ã', 'õ', 'ʋ', 'x', 'ɾ', 'ɓ', 'ɗ', 'c', 'ɟ', 'ʄ', 'aː', 'ɲ', 'ɔː', 'tʲ', 'oː', 'ɤː', 'uː', 'ʊː', 'ɳ', 'ɯː', 'ðʲ', 'tʲʰ', 'ɛ̃', 'ɣ', 'kʲ', 'ũ', 'ĩː', 'rˠ', 'ɛ̃ː', 'ãː', 'ɔ̃ː', 'ũː', 't̪', 'ʑʷ', 'ʑ', 'ɡʷ', 'ŋʷ', 'ɽ', 'o̠', 'w̃', 'ɯ', 'ö', 'ɡ͡b', 'd͡ʒ', 'ʁ', 'q', 'i̠', 'ɛ̠', 'v̩', 'l̥', 'ɤ', 'r̥', 'ɢ', 'ɢʲ', 'χ', 'kʰʲ', 'm̥', 'n̥', 'nː', 'pː', 'lː', 'rː', 'æː', 'eː', 'o˞', 'e˞', 'a˞', 'i˞', 'iː', 'u˞', 'ʕʷ', 'ʕ', 'xʷ', 'ɬ', 'qʷ', 'ɑ', 'ɪ̃', 'ẽ', 'ʊ', 'd̪', 'd͡ʒʰ', 'ɦ', 't̪ʰ', 'd̪ʰ', 'dʰ', 'bʰ', 'ʌ', 'pʼ', 'ʊ̃', 'kʼ', 'β', 'kʼʲ', 'ħ', 'qʼ', 'cʼ', 'kʰʷ', 'qʰʷ', 'ɨ', 'ð', 'ɖ', 'ɸ', 'ʏ', 'ø', 'l̩', 'dʷ', 'pʷ', 'bʷ', 'tʷ', 'ṽ', 'z̃', 'ʃʷ', 'ʒʷ', 'a̘', 't͡s', 'n̤', 'ŋ̩', 'h̩', 'ɹ̝', 'ɑː', 'ɑ̞', 'ɑ̝', 'ɛː', 'ɪː', 'u̝', 'sʲ', 'ɜ', 'ɨː', 'θ', 'l̴', 'n̩', 'j̃', 't͡ɬ', 'sʼ', 'kʷʼ', 'cʰ', 'qʷʼ', 'zʷ', 'qʰ', 'kʷʰ', 't͡ɬʼ', 'cʷʰ', 'ʁʷ', 'tʷʼ', 'a̤', 'ɔ̤', 'o̤ː', 'i̤ː', 'ṳ', 'o̤', 'ṳː', 'ɯ̤', 'tʼ', 'ɑ̃', 'ɫ', 'ɑ̤', 'ʌ̃', 'ɛ̤', 'p͡t', 'b͡d', 'mʷ', 'w̝', 'ʎ̥', 'ɮ', 'ʃ̠', 'fː', 'i̥', 'u̥', 'ɪ̥', 'zː', 'sː', 'ʎ', 'ə̥', 'ʃː', 'e̥', 'ỹ', 'ɯ̈', 'ʉ', 'ɒ', 'xː', 'l̪', 'n̪', 'θː', 'ɒː', 'dˀ', 'bˀ', 't̟', 'æ̟', 'dⁿ', 'ɨ̠', 'tⁿ', 'a̠', 't͡sʰ', 'ɕʰ', 'm̩', 'ɭ', 'ə̃', 'ɕʼ', 't͡ʃʼ', 'ʔʷ', 'tsʰ'] # from UCLA phonetics, some repeated missing_phonemes = set(test_phonemes) - set(phoneme_mapper.keys()) print(f"Missing phonemes: {missing_phonemes}") print(len(missing_phonemes)) # list of phonemes that map to noise: noise_phonemes = [k for k, v in phoneme_mapper.items() if v == 'noise'] noise_phonemes_in_test_set = set(noise_phonemes) & set(test_phonemes) print(f"Noise phonemes in test set: {noise_phonemes_in_test_set}") # only {'ʔ', 'ʔʷ'} are mapped to noise from ucla dataset def check_duplicates(): from collections import defaultdict # Create a dictionary to store the key-value pairs key_value_pairs = defaultdict(set) # Populate the key-value pairs for key, value in phoneme_mapper.items(): key_value_pairs[key].add(value) # Find and print keys with multiple different values duplicates = {key: values for key, values in key_value_pairs.items() if len(values) > 1} print("Duplicate keys with different values:", len(duplicates)) for key, values in duplicates.items(): print(f"Key '{key}' has different values: {values}") def make_phoneme_groups(): phoneme_groups_19 = { # Vowels - Separated by height and frontness "high_front_vowels": ["i", "i:", "ɪ", "y", "ʏ", "iː"], "high_back_vowels": ["u", "u:", "ʊ", "ɯ", "ʉ", "ɨ", "uː"], "mid_front_vowels": ["e", "e:", "ɛ", "ø", "œ", "eː"], "mid_central_vowels": ["ə", "ɜ", "ɜ:", "ɚ", "ʌ", "ɘ", "ɵ"], "mid_back_vowels": ["o", "o:", "ɔ", "ɔ:", "ɤ", "oː"], "low_vowels": ["a", "a:", "æ", "ɐ", "ɑ", "ɑ:", "ɒ", "aː"], "diphthongs": ["aɪ", "eɪ", "ɔɪ", "aʊ", "oʊ", "ɛə", "ɪə", "ʊə"], # Consonants - Organized by manner and voicing "voiceless_stops": ["p", "t", "k", "q", "ʔ", "ʈ", "c"], "voiced_stops": ["b", "d", "g", "ɢ", "ɖ", "ɟ"], "voiceless_fricatives": ["f", "θ", "s", "ʃ", "ç", "x", "h", "ħ", "ʂ", "ɕ", "χ"], "voiced_fricatives": ["v", "ð", "z", "ʒ", "ʝ", "ɣ", "ʕ", "ʐ", "ʑ", "ʁ"], "voiceless_affricates": ["ts", "tʃ", "tɕ", "ʈʂ"], "voiced_affricates": ["dz", "dʒ", "dʑ", "ɖʐ"], "nasals": ["m", "n", "ɲ", "ŋ", "ɴ", "ɱ", "ɳ"], # Liquids, glides, and palatalized sounds "laterals": ["l", "ɭ", "ʎ", "ʟ"], "rhotics": ["r", "ɾ", "ɹ", "ʀ", "ɽ", "ɻ"], "glides": ["j", "w", "ɥ", "ɰ"], "palatalized": ["ɭʲ", "rʲ", "tʲ", "nʲ"], "SIL": ["SIL"], "noise": ["noise"], } phoneme_groups = { # Vowels - Adjusted based on confusion patterns "front_vowels": ["i", "i:", "ɪ", "y", "ʏ", "iː", "e", "e:", "ɛ", "ø", "œ", "eː"], # Merged high/mid front "central_vowels": ["ə", "ɜ", "ɜ:", "ɚ", "ʌ", "ɘ", "ɵ"], # Keep central vowels separate "back_vowels": ["u", "u:", "ʊ", "ɯ", "ʉ", "ɨ", "uː", "o", "o:", "ɔ", "ɔ:", "ɤ", "oː"], # Merged high/mid back "low_vowels": ["a", "a:", "æ", "ɐ", "ɑ", "ɑ:", "ɒ", "aː"], # Keep low vowels separate "diphthongs": ["aɪ", "eɪ", "ɔɪ", "aʊ", "oʊ", "ɛə", "ɪə", "ʊə"], # Keep diphthongs separate # Consonants - Maintain voicing distinction for stops and fricatives "voiceless_stops": ["p", "t", "k", "q", "ʔ", "ʈ", "c", "tʲ"], # Add palatalized t "voiced_stops": ["b", "d", "g", "ɢ", "ɖ", "ɟ"], "voiceless_fricatives": ["f", "θ", "s", "ʃ", "ç", "x", "h", "ħ", "ʂ", "ɕ", "χ"], "voiced_fricatives": ["v", "ð", "z", "ʒ", "ʝ", "ɣ", "ʕ", "ʐ", "ʑ", "ʁ"], # Keep affricates distinction by voicing "voiceless_affricates": ["ts", "tʃ", "tɕ", "ʈʂ"], "voiced_affricates": ["dz", "dʒ", "dʑ", "ɖʐ"], # Merge palatalized nasals with base nasals "nasals": ["m", "n", "nʲ", "ɲ", "ŋ", "ɴ", "ɱ", "ɳ"], # Merge palatalized laterals with base laterals "laterals": ["l", "ɭ", "ɭʲ", "ʎ", "ʟ"], # Merge palatalized rhotics with base rhotics "rhotics": ["r", "rʲ", "ɾ", "ɹ", "ʀ", "ɽ", "ɻ"], # Keep glides separate "glides": ["j", "w", "ɥ", "ɰ"], # Special tokens "SIL": ["SIL"], "noise": ["noise"], } # verify groups cover all phonemes phoneme_groups_flat = [p for g in phoneme_groups for p in phoneme_groups[g]] extra_phonemes = set(phoneme_groups_flat)- set(phoneme_mapped_index.keys()) print(f"extra phonemes: {extra_phonemes}") missing_phonemes = set(phoneme_mapped_index.keys()) - set(phoneme_groups_flat) print(f"missing phonemes: {missing_phonemes}") assert len(missing_phonemes) == 0, "Phoneme groups do not cover all phonemes" # remove extra phonemes: for p in extra_phonemes: for g in phoneme_groups: if p in phoneme_groups[g]: phoneme_groups[g].remove(p) # covert groups to index phoneme_groups_based = {} for g in phoneme_groups: phoneme_groups_based[g] = [phoneme_mapped_index[p] for p in phoneme_groups[g]] # verify groups are correctly mapped for g in phoneme_groups: for p in phoneme_groups[g]: assert phoneme_mapped_index[p] in phoneme_groups_based[g], f"{p} not in {g}" global phoneme_groups_index # clear phoneme_groups_index = {} phoneme_groups_index = { "SIL": 0,} for i, g in enumerate(phoneme_groups): if (g != "SIL") and (g != "noise"): phoneme_groups_index[g] = i+1 phoneme_groups_index["noise"] = len(phoneme_groups_index) print("phoneme_groups_index:", phoneme_groups_index) print("total groups (excluding noise)", len(phoneme_groups_index)-1) # base phonemes index to group index base66_to_groups = {} for p in phoneme_mapped_index: for g in phoneme_groups: if p in phoneme_groups[g]: base66_to_groups[phoneme_mapped_index[p]] = phoneme_groups_index[g] # verify all phonemes are mapped to a group assert len(base66_to_groups) == len(phoneme_mapped_index), "Not all phonemes are mapped to a group" print("base66_to_groups:", base66_to_groups) #main if __name__ == "__main__": # Create the new index #create_new_index() #check_missing_phonemes() #check_duplicates() make_phoneme_groups()