# this is a prepared index generated from the create_new_index() function

# Phoneme symbol -> integer class id.
# An id is simply the position of the symbol in the ordered inventory below
# (0 = 'SIL' ... 66 = 'noise'), so the ORDER OF THIS LIST IS SIGNIFICANT:
# inserting, removing or reordering entries renumbers every later phoneme.
# Long vowels are written with an ASCII colon (e.g. 'i:'), not the IPA 'ː'.
phoneme_mapped_index = {
    symbol: class_id
    for class_id, symbol in enumerate([
        # --- 0: special token ---
        'SIL',

        # --- 1-4: high front vowels and commonly confused similar vowels ---
        'i',      # high front unrounded
        'i:',     # long high front unrounded
        'ɨ',      # high central (placed here due to high confusion with 'i')
        'ɪ',      # near-high front unrounded

        # --- 5-7: mid front vowels ---
        'e',      # mid front unrounded
        'e:',     # long mid front unrounded
        'ɛ',      # open-mid front unrounded

        # --- 8-10: central vowels ---
        'ə',      # schwa (mid central)
        'ɚ',      # r-colored schwa
        'ʌ',      # open-mid back unrounded

        # --- 11-17: back vowels ---
        'u',      # high back rounded
        'u:',     # long high back rounded
        'ʊ',      # near-high back rounded
        'ɯ',      # high back unrounded
        'o',      # mid back rounded
        'o:',     # long mid back rounded
        'ɔ',      # open-mid back rounded

        # --- 18-20: low vowels ---
        'a',      # open central/front unrounded
        'a:',     # long open central/front unrounded
        'æ',      # near-open front unrounded

        # --- 21-22: front rounded vowels ---
        'y',      # high front rounded
        'ø',      # mid front rounded

        # --- 23-27: diphthongs ---
        'aɪ',     # open central to high front
        'eɪ',     # mid front to high front
        'aʊ',     # open central to high back
        'oʊ',     # mid back to high back
        'ɔɪ',     # open-mid back to high front

        # --- 28-34: stops (ordered by place of articulation) ---
        'p',      # voiceless bilabial
        'b',      # voiced bilabial
        't',      # voiceless alveolar
        'd',      # voiced alveolar
        'k',      # voiceless velar
        'g',      # voiced velar
        'q',      # voiceless uvular

        # --- 35-42: affricates and related sibilant fricatives ---
        'ts',     # voiceless alveolar affricate
        's',      # voiceless alveolar fricative
        'z',      # voiced alveolar fricative
        'tʃ',     # voiceless postalveolar affricate
        'dʒ',     # voiced postalveolar affricate
        'ʃ',      # voiceless postalveolar fricative
        'ʒ',      # voiced postalveolar fricative
        'ɕ',      # voiceless alveolo-palatal fricative

        # --- 43-51: other fricatives (ordered by place) ---
        'f',      # voiceless labiodental
        'v',      # voiced labiodental
        'θ',      # voiceless dental
        'ð',      # voiced dental
        'ç',      # voiceless palatal
        'x',      # voiceless velar
        'ɣ',      # voiced velar
        'h',      # voiceless glottal
        'ʁ',      # voiced uvular

        # --- 52-55: nasals (ordered by place) ---
        'm',      # bilabial
        'n',      # alveolar
        'ɲ',      # palatal
        'ŋ',      # velar

        # --- 56-61: liquids and approximants ---
        'l',      # alveolar lateral
        'ɭ',      # retroflex lateral
        'ɾ',      # alveolar tap
        'ɹ',      # alveolar approximant
        'j',      # palatal approximant
        'w',      # labial-velar approximant

        # --- 62-65: palatalized consonants ---
        'tʲ',     # palatalized t
        'nʲ',     # palatalized n
        'rʲ',     # palatalized r
        'ɭʲ',     # palatalized retroflex lateral

        # --- 66: special token ---
        'noise',
    ])
}


# Phoneme class id (a value of phoneme_mapped_index) -> phoneme group id
# (a value of phoneme_groups_index).
# The list position IS the phoneme id, so each row below lines up with one
# section of the phoneme inventory; keep the rows in id order.
# NOTE(review): id 3 ('ɨ') is assigned group 3 (back_vowels) although the
# inventory places it with the high front vowels -- confirm this is intended.
phoneme_groups_mapper = dict(enumerate([
    0,                                   # 0      SIL
    1, 1, 3, 1,                          # 1-4    i, i:, ɨ, ɪ
    1, 1, 1,                             # 5-7    e, e:, ɛ
    2, 2, 2,                             # 8-10   ə, ɚ, ʌ
    3, 3, 3, 3, 3, 3, 3,                 # 11-17  u, u:, ʊ, ɯ, o, o:, ɔ
    4, 4, 4,                             # 18-20  a, a:, æ
    1, 1,                                # 21-22  y, ø
    5, 5, 5, 5, 5,                       # 23-27  diphthongs
    6, 7, 6, 7, 6, 7, 6,                 # 28-34  p, b, t, d, k, g, q
    10, 8, 9, 10, 11, 8, 9, 8,           # 35-42  ts, s, z, tʃ, dʒ, ʃ, ʒ, ɕ
    8, 9, 8, 9, 8, 8, 9, 8, 9,           # 43-51  f, v, θ, ð, ç, x, ɣ, h, ʁ
    12, 12, 12, 12,                      # 52-55  m, n, ɲ, ŋ
    13, 13, 14, 14, 15, 15,              # 56-61  l, ɭ, ɾ, ɹ, j, w
    6, 12, 14, 13,                       # 62-65  tʲ, nʲ, rʲ, ɭʲ
    16,                                  # 66     noise
]))


# Phoneme group name -> group id.
# A group id is the position of the name in the tuple below, so the order
# is significant; these ids are the values used by phoneme_groups_mapper.
phoneme_groups_index = {
    group_name: group_id
    for group_id, group_name in enumerate((
        'SIL',                   # 0
        'front_vowels',          # 1
        'central_vowels',        # 2
        'back_vowels',           # 3
        'low_vowels',            # 4
        'diphthongs',            # 5
        'voiceless_stops',       # 6
        'voiced_stops',          # 7
        'voiceless_fricatives',  # 8
        'voiced_fricatives',     # 9
        'voiceless_affricates',  # 10
        'voiced_affricates',     # 11
        'nasals',                # 12
        'laterals',              # 13
        'rhotics',               # 14
        'glides',                # 15
        'noise',                 # 16
    ))
}


# Raw phoneme-token frequency table: token string -> occurrence count.
# Counts come from train_100h*8_langs (OpenSLR-MLS data, 100 hours each).
# The keys are the unnormalised tokens as they occur in the transcriptions, so
# besides plain IPA symbols (written with the IPA length mark 'ː') they also
# include language markers such as '(en)', ASCII variants such as 'tS' / 'dZ',
# diacritic/bracket notations such as 't[' and multi-symbol sequences.
# Entries appear to be listed in descending order of count.
# NOTE(review): presumably reference data for deciding how raw tokens are
# folded by phoneme_mapping -- confirm against the code that consumes it.
complete_vocab = {'SIL': 4914031, 'a': 1604572, 'n': 1501496, 't': 1345451, 's': 1242856, 'i': 1207390, 'e': 985568, 'o': 850470, 'm': 840466, 'l': 840200, 'r': 825931, 'd': 821689, 'k': 814493, 'ɛ': 700232, 'p': 607786, 'ə': 492948, 'v': 432914, 'j': 430514, 'u': 422499, 'ɾ': 419723, 'b': 413539, 'ɑ': 399576, 'ɔ': 344276, 'ʌ': 334025, 'ɪ': 294767, 'f': 292469, 'z': 286215, 'ɡ': 267606, 'ʃ': 249935, 'ɐ': 247989, 'w': 222719, 'ʊ': 200737, 'h': 189391, 'ʁ': 161395, 'ð': 159285, 'ɨ': 157902, 'x': 151422, 'eː': 141335, 'y': 138328, 'iː': 135167, 'ŋ': 118468, 'aɪ': 104170, 'ts': 96727, 'ɹ': 96111, 'æ': 83801, 'tʃ': 83484, 'θ': 81846, 'ʒ': 81251, 'uː': 79788, 'aː': 69487, 'ɕ': 68197, 'β': 67991, 'oː': 67190, 'ɑː': 66515, 'ɣ': 64902, 'eɪ': 63049, 'tʲ': 60946, 'ø': 58520, 'ɭ': 58455, 'nʲ': 56439, 'dʒ': 54175, 'ɑ̃': 53516, 'aʊ': 53297, 'q': 49319, 'ɲ': 48479, 'rʲ': 46853, 'ɭʲ': 45521, 'ɔ̃': 44985, 'ɯ': 43339, 'sʲ': 38535, 'ɲʲ': 38014, 'ɒ': 37682, 'vʲ': 37231, 'ʎ': 37145, 'ç': 35610, 'ʋ': 32705, 'ɚ': 32019, 'tɕ': 32006, 'mʲ': 31189, 'dʲ': 29989, 'ɜ': 27714, 'ja': 27523, 'ʔ': 26931, 'oʊ': 26398, 'ɑɨ': 24604, 'tʃʲ': 24301, '1': 23766, 'dʑ': 22131, 'ɛɪ': 21834, 'tː': 21127, 'ᵻ': 20631, 'ɛ̃': 20508, 'uɨ': 20396, 'ɫ': 19763, 'ɬ': 19662, 'ʑ': 18169, 'œ': 17838, 'oɪ': 16897, 'ɔɨ': 16178, 'ɔː': 15555, 'ɐ̃': 15510, 'ɨː': 15065, 'ɜː': 14661, 'ju': 14500, 'pʲ': 14429, 'aɨ': 13971, 'əl': 13858, 'ɵ': 13512, 'kʲ': 13204, 'ss': 12685, 'ɐ̃ʊ̃': 12237, 't[': 12211, 'əɪ': 12209, 'ɑːɹ': 10053, 'bʲ': 9958, 'd[': 9275, 'yː': 9033, 'eʊ': 8931, 'ɨu': 7653, 'ɡʲ': 7633, 'ɔːɹ': 7072, '(en)': 6245, 'œy': 6018, 'kː': 5801, 'əɨ': 5629, 'ɔø': 5613, 'oːɹ': 5458, 'u"': 5356, 'fʲ': 5315, 'pː': 5307, 'ɛɹ': 5299, 'ɪː': 5068, '??': 5027, 'ɛː': 4894, 'øː': 4796, 'ɔɪ': 4603, 'dZ': 4345, 'ɪu': 4320, 'c': 4200, 'S': 4197, 'ʕ': 4050, '(fr)': 3885, 'ʌʊ': 3787, 'tS': 3731, 'oe': 3730, 'iə': 3654, 'dʒː': 3441, 'ɪɹ': 3101, 'r̝̊': 3099, 'bː': 3051, 'ɟ': 2761, 'uɪ': 2632, 'ʊɹ': 2589, 'tʃː': 2564, 'ħ': 2318, 'ũ': 2158, 
's^': 2080, 't̪': 1778, 'r̝': 1702, 'ɪ^': 1651, 'tsː': 1558, 'dzː': 1479, 'r̩': 1272, 'u:': 1189, 'aɪɚ': 1180, '(de)': 1166, 's̪': 1070, 'dz': 1027, 'iʊ': 940, 'aɪə': 928, 'dˤ': 920, 'χ': 845, 'æi': 819, 'œ̃': 766, '(it)': 719, 'ɑ:': 715, 'o:': 623, 'n̩': 587, 'l̩': 536, 'æː': 534, 'dː': 532, 'õ': 491, 'N': 490, 'y:': 415, 'pf': 342, 'əʊ': 342, 'ʝ': 323, 't^': 288, 'oe:': 257, '(nl)': 237, 'ɛʊ': 237, '(ptpt)': 196, 'e:': 183, 'eə': 144, 'd^': 131, 'i.ː': 129, 'yʊ': 101, 't^ː': 79, 'nl': 76, '(fa)': 65, 'æiː': 64, 'yi': 63, '(es)': 61, 'dʒʲ': 47, 'qː': 43, '(ru)': 40, 'ɡː': 37, 'ɪuː': 37, 'ʊə': 34, 'X': 31, 'a.ː': 31, 'u.ː': 31, 'rr': 25, 'mb': 25, 'ɵː': 24, 'd̪w': 23, 'ʂ': 22, '(tt)': 22, 'dm': 20, 'daː': 17, 'əː': 17, 'it': 17, 'ɡd': 17, 'mj': 16, 'db': 16, 'wb': 16, 'iːː': 15, 'mt': 15, 'ɑk': 15, 'i:': 15, 'da': 14, 'nb': 14, 'eð': 13, 'mx': 13, 'maː': 13, 'tk': 13, 'niː': 13, 'rb': 13, 'mh': 13, 'dˤdˤ': 13, 'fm': 13, 'nm': 11, 'eːh': 11, 'mtʃ': 11, 'ma': 11, 'ʊːt': 11, 'aːn': 11, 'iːe': 11, 'im': 10, 'eːb': 10, 'np': 10, 'aɪaɪ': 9, 'ʃj': 9, 'eːt': 9, 'jː': 8, 'mv': 8, 'ae': 8, 'ed': 8, 'nn': 8, 'dtʃ': 8, 'ɑh': 8, 'tr': 7, 'sb': 7, 'tn': 7, 'lx': 7, 'eːa': 7, 'il': 7, 'ɑj': 7, 'ɑaː': 7, 'oːs': 7, 'eːq': 7, 'ah': 7, 'bb': 7, 'or': 7, 'ɑm': 7, 'kt': 7, 'as': 7, 'eβ': 6, 'eːd': 6, 'ob': 6, 'eːuː': 6, 'rtʃ': 6, 'ʃd': 6, 'md': 6, 'duː': 6, 'eːaː': 6, 'ɡz': 6, 'hx': 6, 'lk': 6, 'eːf': 6, 'ɑq': 6, 'nk': 6, 'nx': 6, 'nj': 6, 'dɔ': 6, 'is': 6, 'aɪp': 6, 'əm': 6, 'laː': 5, 'tb': 5, 'aa': 5, 'zm': 5, 'ntʃ': 5, 'ep': 5, 'bh': 5, 'lh': 5, 'do': 5, 'eːp': 5, 'miː': 5, 'tuː': 5, 'lm': 5, 'mr': 5, 'sz': 5, 'nv': 4, 'in': 4, 'ɑuː': 4, 'iaː': 4, 'oːb': 4, 'uːm': 4, 'naː': 4, 'eːs': 4, 'itʃ': 4, 'eːv': 4, 'az': 4, 'ha': 4, 'dɡ': 4, 'hb': 4, 'mf': 4, 'ɑn': 4, 'ia': 4, 'lb': 4, 'na': 4, 'ld': 4, 'd̪': 4, 'ndʒ': 3, 'uːk': 3, 'uːb': 3, 'mo': 3, 'ɡh': 3, 'mk': 3, 'dk': 3, 'eːtʃ': 3, 'dp': 3, 'tiː': 3, 'bv': 3, 'ta': 3, 'th': 3, 'ip': 3, 'eːj': 3, 'eːx': 3, 'mz': 3, 'eːz': 3, '(pl)': 2, 
'diː': 2, 'eh': 2, 'biː': 2, 'mʃ': 2, 'mn': 2, 'dʒv': 2, 'hʃ': 2, 'liː': 2, 'mɡ': 2, 'jiː': 2, 'ms': 2, 'ʃz': 2, 'ne': 2, 'on': 2, 'raː': 2, 'nh': 2, 'eːʃ': 2, 'rm': 2, 'ʊː': 2, 'laɪ': 1, 'ɑa': 1, 'td': 1, 'eːe': 1, 'mp': 1, 'ɑt': 1, 'nr': 1, 'me': 1, 'ɑo': 1, 'ik': 1, 'iːv': 1, 'dh': 1, 'eːɑ': 1, 'dl': 1, 'ʃdʒ': 1, 'ns': 1}


# Ordered list of base phoneme symbols, assembled category by category.
# Long vowels here use the IPA length mark 'ː' (not an ASCII colon) and the
# voiced velar stop is the IPA letter 'ɡ' (not ASCII 'g').
# NOTE(review): 'ʁ', 'ts', 'tʃ' and 'ʌ' occur twice (once in the first
# "newly added" row and once in their own category). The duplicates are kept
# on purpose here because removing them would shift list positions -- confirm
# whether consumers expect unique entries.
base_phonemes = (
    # Newly added symbols
    ['x', 'ç', 'ɣ', 'ʁ', 'ts', 'tʃ', 'ʌ', 'ɭ']
    # Stops
    + ['p', 'b', 't', 'd', 'k', 'ɡ', 'ʔ']
    # Fricatives
    + ['f', 'v', 'θ', 'ð', 's', 'z', 'ʃ', 'ʒ', 'h', 'ʂ', 'ʐ', 'ɕ', 'ʑ', 'ʁ']
    # Affricates
    + ['ts', 'dz', 'tɕ', 'dʑ', 'tʃ', 'dʒ', 'pf']
    # Nasals
    + ['m', 'n', 'ŋ']
    # Liquids / glides
    + ['l', 'ɫ', 'ɹ', 'j', 'w', 'ɾ']
    # Palatalized consonants
    + ['nʲ', 'lʲ', 'rʲ']
    # Vowels: front, open, back, then central/unrounded and front rounded
    + ['i', 'iː', 'ɪ', 'e', 'eː', 'ɛ', 'æ']
    + ['a', 'aː', 'ɑː']
    + ['o', 'ɔː', 'ʊ']
    + ['u', 'uː']
    + ['ə', 'ɚ', 'ʌ', 'ɨ', 'ɤ', 'ɯ']
    + ['œ', 'ø', 'øː', 'ʏ', 'yː']
    # Diphthongs
    + ['eɪ', 'aɪ', 'ɔɪ', 'oʊ', 'aʊ']
    # Other
    + ['ɒ', 'ɜː', 'ɲ']
    # Special tokens
    + ['noise', 'SIL']
)

phoneme_mapping = {
    
    # Core vowels - simplified based on confusion patterns
    'ə': 'ə', 
    #'ʌ': 'ə',  # Merge due to high confusion
    'ʌ': 'ʌ', # didn't work well before but still keep it
    'ɪ': 'ɪ', 'i': 'i', 
    'iː': 'i:',
    'ʊ': 'ʊ',
    'u': 'u',
    'uː': 'u:',
    'ɛ': 'ɛ', 'e': 'e', 'eː': 'e:',
    'ɔː': 'ɔ', 'ɔ': 'ɔ',
    #'ɒ': 'ɒ', # Merge to 'a' due to 100% wrong predictions in confusion matrix (23 Jan)
    'ɒ': 'a',
    'æ': 'æ', # DO NOT merge
    
    'ɑː': 'a:', 
    'ɑ': 'a', 
    'a': 'a',
    'ɜː': 'ʌ',
    'ɜ': 'ʌ',
    'ɚ': 'ɚ',
    'o': 'o',
    'ɨ': 'ɨ',
    
    # Common diphthongs - keep distinct ones
    'eɪ': 'eɪ', 'aɪ': 'aɪ', 'ɔɪ': 'ɔɪ',
    'aʊ': 'aʊ', 'oʊ': 'oʊ',
    
    # Less common diphthongs - map to similar common ones
    'ʌʊ': 'aʊ', 'eʊ': 'aʊ', 'ɛʊ': 'aʊ', 'əʊ': 'oʊ',
    'ɛɪ': 'eɪ', 'ʊɪ': 'aɪ', 'ea': 'eɪ',
    'aʊ̯': 'aʊ', 'aɪ̯': 'aɪ', 'ɔʏ̯': 'ɔɪ',
    
    # Core consonants
    'p': 'p', 'b': 'b', 't': 't', 'd': 'd',
    'k': 'k', 'g': 'g', 'm': 'm', 'n': 'n',
    'ŋ': 'ŋ', 'f': 'f', 'v': 'v', 'θ': 'θ',
    'ð': 'ð', 's': 's', 'z': 'z', 'ʃ': 'ʃ',
    'ʒ': 'ʒ', 'h': 'h', 'l': 'l', 'ɹ': 'ɹ',
    'j': 'j', 'w': 'w', 'ɲ': 'ɲ', 'ɾ': 'ɾ',
    
    # Consonant mergers based on confusion
    # 'ɣ': 'g',      # Merge with closest stop
    'ɣ': 'ɣ',  # emprically confused but will keep it
    #'ʁ': 'ɹ',      # Map to rhotic
    'ʁ': 'ʁ',
    'r': 'ɹ',      # Map to rhotic
    #'x': 'h',      # Map to closest fricative
    'x': 'x',
    #'ç': 'ʃ',      # Map to closest fricative
    #'ç': 's',      # Based on empirical confusion
    'ç': 'ç',
    'ʂ': 'ʃ',      # Map to closest fricative
    'ʐ': 'ʒ',      # Map to closest fricative
    #'ɕ': 'ʃ',      # Map to closest fricative
    'ɕ': 'ɕ',       # keep it
    'ʑ': 'ʒ',      # Map to closest fricative
    
    # Affricates: 'ts', 'tʃ' and 'dʒ' are kept as-is; the remaining ones fold into the closest kept affricate or fricative
    #'ts': 't',
    'ts': 'ts',
    'dz': 'dʒ',
    #'tʃ': 'ʃ',
    'tʃ': 'tʃ',
    #'dʒ': 'ʒ',
    'dʒ': 'dʒ',
    'tɕ': 'tʃ',
    'dʑ': 'dʒ',
    'pf': 'f',
    
    #'tʲ': 't', 
    'tʲ': 'tʲ',  # high freuqncy, keep it
    #'nʲ': 'n', 
    'nʲ': 'nʲ', # high freuqncy, keep it
    #'rʲ': 'ɹ',
    'rʲ': 'rʲ', # high freuqncy, keep it
    # Remove palatalization
    'lʲ': 'l',  
    'dʲ': 'd', 'sʲ': 's', 'vʲ': 'v',
    'fʲ': 'f', 'mʲ': 'm',
    'pʲ': 'p', 'kʲ': 'k', 'bʲ': 'b',
    'ɲʲ': 'ɲ', 'dʒʲ': 'dʒ',
    
    # Simplify geminate consonants
    'tː': 't', 'dː': 'd', 'kː': 'k',
    'gː': 'g', 'pː': 'p', 'bː': 'b',
    'fː': 'f', 'vː': 'v', 'sː': 's',
    'zː': 'z', 'ʃː': 'ʃ', 'ʒː': 'ʒ',
    'mː': 'm', 'nː': 'n', 'ŋː': 'ŋ',
    'lː': 'l', 'rː': 'ɹ', 'jː': 'j',
    
    # Nasal vowels to oral counterparts
    'ɑ̃': 'a', 'ɛ̃': 'ɛ', 'ɔ̃': 'ɔ',
    'ũ': 'u', 'õ': 'oʊ', 'ɐ̃': 'ʌ',
    
    # R-colored vowels
    'ɑːɹ': 'ɚ', 'ɔːɹ': 'ɚ',
    'ʊɹ': 'ɚ', 'ɪɹ': 'ɚ', 'ɛɹ': 'ɚ',
    'oːɹ': 'ɚ',
    
    # Vowel sequences
    'ia': 'i:', 'ua': 'u:',
    'ɔø': 'ɔ', 'iːɛ': 'i:',
    'ʊə': 'ʊ', 'iə': 'i:',
    'eə': 'ɛ',
    
    # Common sequences
    # 'əl': 'əl',  # Keep this distinct sequence
    #'əl': 'o', # based on empirical confusion. theoretically, this should be merged with 'l' or 'e' but it's most confused with 'o'
    'əl': 'l',
    'n̩': 'n',
    'ʃf': 'ʃ',
    'eð': 'ð',
    'ns': 'n',
    'nd': 'n',
    'ʃts': 'ts',
    
    # Special symbols
    'SIL': 'SIL',
    'noise': 'noise',   # noise will be ignored by the model. CTC will take it as blank token.
    '': 'SIL',
    'ʔ': 'noise',
    
    # Language markers to silence
    '(en)': 'SIL', '(es)': 'SIL', '(fr)': 'SIL',
    '(de)': 'SIL', '(it)': 'SIL', '(nl)': 'SIL',
    '(pl)': 'SIL', '(ru)': 'SIL', '(ptpt)': 'SIL',
    
    # Error cases to noise
    '??': 'noise', 'uk': 'noise', 'it': 'noise',
    'ɡd': 'noise', 'rd': 'noise', 'as': 'noise',
    'up': 'noise', 'os': 'noise', 'kf': 'noise',
    '1': 'noise', 'ʃd': 'noise', 'ʃz': 'noise',
    'ʃn': 'noise',


    # Vowels
    'y': 'y',        # Map to existing long form
    'yː': 'y',       # Keep distinct high front rounded vowel
    'œ': 'ø',         # Map to closest unrounded vowel
    'ø': 'ø',        # Map to long version
    'øː': 'ø',       # Keep distinct mid front rounded vowel
    'ɐ': 'ʌ',         # Map to schwa
    'aː': 'a:',       # Keep long a
    #'oː': 'ɔ',       # Map to similar long vowel
    'oː': 'o:',       # Keep distinct long o
    'ɛː': 'ɛ',        # Map to base form
    'ɪː': 'i:',       # Map to similar long vowel
    'ɵ': 'ʊ',         # Map to closest vowel
    'ᵻ': 'ɪ',         # Map to similar vowel
    
    # Doubled vowels (collapse to a single short vowel)
    'aa': 'a',
    'ɐɐ': 'a',
    'ææ': 'æ',
    
    # Diphthongs
    'yʊ': 'u',       # Map to similar monophthong
    'œy': 'ɔɪ',       # Map to similar diphthong
    'uɪ': 'aɪ',       # Map to existing diphthong
    'oɪ': 'ɔɪ',       # Map to similar diphthong
    'iʊ': 'u',       # Map to similar monophthong
    'aɪə': 'aɪ',      # Map to base diphthong
    'aɪɚ': 'aɪ',      # Map to base diphthong
    
    # Nasal vowels
    'ɐ̃ʊ̃': 'aʊ',      # Map to oral diphthong
    'œ̃': 'ɛ',         # Map to oral vowel
    
    # Consonants
    'ʝ': 'j',         # Map to similar approximant
    'ɟ': 'ʒ',        # Map to similar affricate
    'ʋ': 'v',         # Map to similar fricative
    'd̪': 'd',         # Map dental to alveolar
    't̪': 't',         # Map dental to alveolar
    'ɬ': 'l',         # Map to plain lateral
    'ʎ': 'l',         # Map to plain lateral
    'β': 'v',         # Map to similar fricative
    'ɡ': 'g',         # Standardize to 'g'
    
    # Geminate consonants
    'ɡː': 'g',        # Map to single consonant
    'tsː': 'ts',      # Map to single affricate
    'dzː': 'd',      # Map to single affricate
    #'tʃː': 'ʃ',      # Map to single affricate
    'tʃː': 'tʃ',      # Map to single affricate
    'dʒː': 'dʒ',      # Map to single affricate
    'ss': 's',        # Map to single consonant
    
    # Palatalized consonants
    'ɡʲ': 'g',        # Map to plain consonant
    
    # Sequences
    'dɔ': 'noise',     # Map unusual sequence to noise


    # These are found (with counts) in Google MSWC data, but not in the OpenSLR-MLS data
    # Complex sequences with frequency counts
    'ja': 'j',      # Common sequence (36,809) -> simplify to first component
    'ju': 'j',      # Common sequence (19,620) -> simplify to first component  
    'tʃʲ': 'tʃ',     # Common palatalized affricate (32,707) -> map to fricative
    #'ɭ': 'l',       # Very common retroflex lateral (78,504) -> map to alveolar
    'ɭ': 'ɭ',
    'ɭʲ': 'ɭʲ',      # Common palatalized retroflex (61,298) -> map to plain lateral
    'u"': 'u',     # Quote variant (7,265) -> normalize to standard long u
    'ɪ^': 'ɪ',      # Rare diacritic variant (2,222) -> remove diacritic
    'sz': 's',      # Rare sequence (5) -> simplify to first component
    #'q': 'k',       # Common uvular stop (75,838) -> map to velar
    'q': 'q',     # keep it EVEN though it's relively rare (45k)
    #'qː': 'k',      # Rare long uvular (103) -> map to velar
    'qː': 'q',
    'r̝̊': 'ɹ',      # Rare trilled/fricative r (3,099) -> map to approximant
    'r̝': 'ɹ',      # Rare variant (1,702) -> map to approximant
    'r̩': 'ɹ',      # Rare syllabic (1,272) -> map to approximant
    'l̩': 'l',      # Rare syllabic (536) -> map to standard lateral
    'c': 'k',       # Uncommon palatal stop (4,195) -> map to velar

    # Vowel sequences
    'uɨ': 'ɨ',     # Common sequence (20,396) -> map to monophthong
    'aɨ': 'aɪ',     # Common sequence (13,971) -> map to similar diphthong
    'ɨu': 'u:',     # Less common (7,653) -> map to monophthong
    'ɪu': 'u:',     # Uncommon (4,320) -> map to monophthong
    'ɨː': 'ɨ',      # Common variant (15,065) -> remove length marker
    'ɑɨ': 'aɪ',     # Common sequence (24,604) -> map to diphthong
    'əɪ': 'eɪ',     # Common sequence (12,209) -> map to similar diphthong
    'əɨ': 'ɨ',      # Less common (5,629) -> simplify to first component
    'ɔɨ': 'ɔɪ',     # Common sequence (16,178) -> map to similar diphthong
    'ɪuː': 'u:',    # Rare sequence (37) -> map to monophthong

    # Rare sequences: 1-5 occurrences ---------------------------------------
    # Some of the extremely rare consonant-consonant and vowel-consonant sequences map to 'noise' (i.e., ignored), most don't.
    
    # More nasal sequences
    'nm': 'n',    # was 'noise', map to alveolar nasal
    'nn': 'n',    # was 'noise', map to single nasal
    'mn': 'm',    # was 'noise', map to bilabial nasal
    'mm': 'm',    # was 'noise', map to single nasal
    'na': 'n',    # was 'noise', preserve nasal
    'maː': 'm',   # was 'noise', preserve nasal
    'mz': 'm',    # was 'noise', preserve nasal
    'ms': 'm',    # was 'noise', preserve nasal
    'mf': 'm',    # was 'noise', preserve nasal
    'mɡ': 'm',    # was 'noise', preserve nasal
    'mx': 'm',    # was 'noise', preserve nasal
    'mv': 'm',    # was 'noise', preserve nasal
    'mʃ': 'm',    # current mapping is good
    
    # Stop sequences
    'dk': 'd',    # was 'noise', preserve first stop
    'dp': 'd',    # was 'noise', preserve first stop
    'db': 'd',    # was 'noise', preserve first stop
    'td': 't',    # was 'noise', preserve first stop
    'tb': 't',    # was 'noise', preserve first stop
    'tn': 't',    # was 'noise', preserve stop
    
    # Long vowel sequences
    'eːs': 'e:',  # was 'noise', preserve long vowel
    'eːt': 'e:',  # was 'noise', preserve long vowel
    'eːp': 'e:',  # was 'noise', preserve long vowel
    'eːf': 'e:',  # current mapping is good
    'eːz': 'e:',  # current mapping is good
    'eːj': 'e:',  # current mapping is good
    'eːx': 'e:',  # current mapping is good
    'eːʃ': 'e:',  # current mapping is good
    'oːs': 'o:',  # current mapping is good
    'oːb': 'o:',  # current mapping is good
    
    # Vowel sequences
    'ɑj': 'aɪ',   # was 'noise', map to diphthong
    'ɑh': 'a',    # was 'noise', preserve vowel
    'ɑm': 'a',    # was 'noise', preserve vowel
    'ɑk': 'a',    # was 'noise', preserve vowel
    'ɑn': 'a',    # was 'noise', preserve vowel
    'ɑq': 'a',    # was 'noise', preserve vowel
    'ɑt': 'a',    # was 'noise', preserve vowel
    'ɑo': 'a',    # was 'noise', preserve first vowel
    'ɑa': 'a',    # was 'noise', preserve first vowel
    'ɑaː': 'a:',  # was 'noise', map to long vowel
    'ɑuː': 'aʊ',  # was 'noise', map to diphthong
    
    # Other sequences
    'dʒv': 'dʒ',  # current mapping is good
    'bv': 'b',    # was 'noise', preserve stop
    'bh': 'b',    # was 'noise', preserve stop
    'ɡh': 'g',    # was 'noise', preserve stop
    'ɡz': 'g',    # was 'noise', preserve stop
    'hx': 'x',    # current mapping is good
    'ʃj': 'ʃ',    # was 'noise', preserve fricative
    
    # Special cases
    '(fa)': 'SIL',  # current mapping is good
    'bb': 'b',      # current mapping is good

    'uːb': 'u:',
    'uːk': 'u:',
    'laɪ': 'noise',
    # ---------------------------------------  End of rare sequences

    # Vowels and length variants
    'əː': 'ə',         # Long schwa maps to schwa (index 18)
    'æː': 'æ',         # Long ash maps to ash (index 32)
    'æi': 'eɪ',        # Map to similar diphthong (index 23)
    'æiː': 'eɪ',       # Map to similar diphthong (index 23)
    'ɵː': 'ʊ',         # Long rounded vowel maps to nearest equivalent (index 22)
    #'ɯ': 'ʊ',         # Unrounded high back vowel maps to nearest equivalent (index 22)
    'ɯ': 'ɯ',

    # Alternative transcription formats
    'e:': 'e:',        # Long e
    'eː': 'e:',        # Normalize colon to IPA length mark (index 43)
    #'e:': 'e',          # NOT merged due to high confusion
    'o:': 'o:',
    'y:': 'y',        # Normalize colon to IPA length mark (index 39)
    'u:': 'u:',        # Normalize colon to IPA length mark (index 5)
    'i:': 'i:',        # Normalize colon to IPA length mark (index 12)
    'ɑ:': 'a',        # Normalize colon to IPA length mark (index 13)
    'oe:': 'ø',       # Normalize colon to IPA length mark (index 40)
    'oe': 'ø',        # Map to equivalent (index 40)
    
    # ASCII-based transcription variants
    'S': 's',          # ASCII variant of 's' (index 21)
    'N': 'n',          # ASCII variant of 'n' (index 11)
    'X': 'k',          # ASCII variant, typically representing 'k' (index 27)
    'tS': 'tʃ',         # ASCII variant of 'tʃ' (index 1)
    'dZ': 'dʒ',         # ASCII variant of 'dʒ' (index 2)
    
    # Special characters and diacritics
    't^': 't',         # Remove diacritic (index 4)
    's^': 's',         # Remove diacritic (index 21)
    'd^': 'd',         # Remove diacritic (index 9)
    't^ː': 't',        # Remove diacritic and length (index 4)
    't[': 't',         # Remove bracket notation (index 4)
    'd[': 'd',         # Remove bracket notation (index 9)
    
    # Arabic phonemes
    'ʕ': 'h',          # Voiced pharyngeal fricative maps to nearest fricative (index 37)
    'ħ': 'h',          # Voiceless pharyngeal fricative maps to 'h' (index 37)
    'dˤ': 'd',         # Pharyngealized 'd' maps to plain 'd' (index 9)
    's̪': 's',          # Dental 's' maps to plain 's' (index 21)
    'χ': 'x',          # Voiceless uvular fricative maps to 'h' (index 37)
    'dˤdˤ': 'd',       # Doubled pharyngealized 'd' maps to 'd' (index 9)
    'dd': 'd',         # ASCII variant of doubled/pharyngealized 'd' (index 9)
    
    # Dot notation variants
    'i.ː': 'i:',       # Normalize dot notation (index 12)
    'a.ː': 'a:',       # Normalize dot notation (index 13)
    'u.ː': 'u:',       # Normalize dot notation (index 5)
    
    # Lateral approximant variant
    'ɫ': 'l',          # Velarized lateral maps to plain 'l' (index 16)
    
    # Consonant sequences (map to noise)
    'kt': 'noise',     # Consonant sequence (index 50)
    'd̪w': 'noise',     # Consonant sequence (index 50)
    'wb': 'noise',     # Consonant sequence (index 50)
    'fm': 'noise',     # Consonant sequence (index 50)
    
    # Vowel-consonant sequences (map to noise)
    'ʊːt': 'noise',    # Vowel-consonant sequence (index 50)
    'aɪp': 'noise',    # Vowel-consonant sequence (index 50)
    'əm': 'noise',     # Vowel-consonant sequence (index 50)
    'aːn': 'a:',    # Vowel-consonant sequence (index 50)
    'iːe': 'i:',    # Vowel-vowel sequence (index 50)
    'yi': 'i:',     # Vowel-vowel sequence (index 50)
    
    # Language markers (map to SIL)
    '(tt)': 'SIL',      # Language marker (index 0)

    # Double long vowel - map to standard long vowel
    'iːː': 'i:',       # Excessive length mark, normalize to standard long i (index 12)
    
    # Doubled diphthong - map to single diphthong
    'aɪaɪ': 'aɪ',      # Repeated diphthong, map to single instance (index 7)
    
    # Consonant and vowel-consonant sequences - mapped to noise, except 'ndʒ' which keeps its affricate
    'ndʒ': 'dʒ',    # Consonant cluster (index 50)
    'tr': 'noise',     # Consonant cluster (index 50)
    'eβ': 'noise',     # Vowel-consonant sequence (index 50)


    # Double palatalization - map to single palatalized form then apply existing mappings
    'ʂʲ': 'ʃ',         # Map palatalized retroflex to palato-alveolar (index 1)
    'nʲʲ': 'nʲ',        # Double palatalized nasal to plain nasal (index 11)
    'tsʲ': 'ts',        # Palatalized affricate follows affricate mapping (index 4)
    'xʲ': 'h',         # Palatalized velar fricative to h (index 37)
    'dʑʲ': 'dʒ',        # Palatalized voiced affricate to voiced palato-alveolar (index 2)
    'ɕʲ': 'ɕ',         # Palatalized alveolo-palatal to palato-alveolar (index 1)
    'tɕʲ': 'ʃ',        # Palatalized affricate to palato-alveolar (index 1)
    'tʲʲ': 'tʲ',        # Double palatalized stop to plain stop (index 4)
    'ʒʲ': 'ʒ',        # Palatalized palato-alveolar remains (index 2)
    'ʃʲʲ': 'ʃ',        # Double palatalized palato-alveolar remains (index 1)
    'tsʲʲ': 'ts',       # Double palatalized affricate to stop (index 4)
    'ɾʲʲ': 'ɾ',        # Double palatalized tap remains (index 48)
    'zʲʲ': 'z',        # Double palatalized fricative remains (index 36)
    'ɾʲ': 'rʲ',         # Palatalized tap remains (index 48)
    'ʃʲ': 'ʃ',         # Palatalized palato-alveolar remains (index 1)
    'mʲʲ': 'm',        # Double palatalized nasal to plain (index 28)
    'ʲ': 'noise',      # Isolated palatalization mark to noise (index 50)

    # Vowel sequences - map to nearest phoneme or diphthong
    'uo': 'oʊ',        # Map to nearest diphthong (index 24)
    'ee': 'i:',        # Map to long vowel (index 12)
    'ie': 'i:',        # Map to long vowel (index 12)
    'ai': 'aɪ',        # Map to standard diphthong (index 7)
    'ui': 'u:',        # Map to long vowel (index 5)
    'au': 'aʊ',        # Map to standard diphthong (index 8)
    'eɑ': 'ɛ',         # Map to nearest monophthong (index 6)
    'iu': 'u:',        # Map to long vowel (index 5)
    'auː': 'aʊ',       # Map to standard diphthong (index 8)
    'ei': 'eɪ',        # Map to standard diphthong (index 23)
    'eu': 'oʊ',        # Map to nearest diphthong (index 24)
    'aiː': 'aɪ',       # Map to standard diphthong (index 7)
    'iuː': 'u:',       # Map to long vowel (index 5)
    'eiː': 'eɪ',       # Map to standard diphthong (index 23)
    'euː': 'oʊ',       # Map to nearest diphthong (index 24)
    'ɔa': 'ɔ',        # Map to long vowel (index 3)
    'yɪ': 'y',        # Map to long vowel (index 39)
    'iɪ': 'i:',        # Map to long vowel (index 12)
    'eo': 'oʊ',        # Map to nearest diphthong (index 24)

    # Special notations
    'cː': 'k',         # Long palatal stop to velar (index 27)

    # All Chinese tonal patterns (with numbers) and complex sequences map to 'noise'
    # Examples:
    'iɜk': 'noise', 'onɡ5': 'noise', 'ts.': 'ts', 'ə5': 'noise',
    'ŋf': 'noise', 'u2': 'noise', 'oɜɕ': 'noise', 'iɜ': 'noise',

    # MLS-fr
    # Consonant sequences to noise
    'ls': 'noise',     # Lateral + fricative sequence maps to noise (50)
    'll': 'noise',     # Double lateral sequence maps to noise (50)
    
    # Vowel-consonant sequences to noise
    'øːl': 'noise',    # Long oe + lateral sequence maps to noise (50)
    'øːs': 'noise',    # Long oe + fricative sequence maps to noise (50)


    # from UCLA Phonetics Dataset

    # Syllabic consonants - map to their non-syllabic counterparts
    'h̩': 'h',      # Syllabic h to h (37)
    'ɹ̩': 'ɹ',      # Syllabic r to r (17)
    'ŋ̩': 'ŋ',      # Syllabic ng to ng (34)
    'ɫ̩': 'l',      # Syllabic dark l to l (16)
    'v̩': 'v',      # Syllabic v to v (15)
    'm̩': 'm',      # Syllabic m to m (28)

    # Aspirated consonants - map to unaspirated counterparts
    'pʰ': 'p',     # Aspirated p to p (25)
    'tʰ': 't',     # Aspirated t to t (4)
    'kʰ': 'k',     # Aspirated k to k (27)
    'sʰ': 's',     # Aspirated s to s (21)
    'ʃʰ': 'ʃ',     # Aspirated sh to sh (1)
    'cʰ': 'k',     # Aspirated c to k (27)
    't͡sʰ': 'ts',    # Aspirated ts to t (4)
    't͡ʃʰ': 'tʃ',    # Aspirated tsh to sh (1)
    'ɕʰ': 'ɕ',     # Aspirated alveolo-palatal to sh (1)

    # Labialized consonants - map to base consonants
    'tʷ': 't',     # Labialized t to t (4)
    'kʷ': 'k',     # Labialized k to k (27)
    'pʷ': 'p',     # Labialized p to p (25)
    'ʒʷ': 'ʒ',     # Labialized zh to zh (2)
    'xʷ': 'h',     # Labialized x to h (37)
    'dʷ': 'd',     # Labialized d to d (9)
    'bʷ': 'b',     # Labialized b to b (26)
    'mʷ': 'm',     # Labialized m to m (28)
    'ŋʷ': 'ŋ',     # Labialized ng to ng (34)
    
    # Retroflexes - map to closest non-retroflex
    'ʈ': 't',      # Retroflex t to t (4)
    'ɖ': 'd',      # Retroflex d to d (9)
    'ɳ': 'n',      # Retroflex n to n (11)
    'ɻ': 'ɹ',      # Retroflex r to r (17)
    'ɽ': 'ɾ',      # Retroflex flap to tap (48)

    # Breathy voiced - map to regular voiced
    'n̤': 'n',      # Breathy n to n (11)
    'b̤': 'b',      # Breathy b to b (26)
    'j̤': 'j',      # Breathy j to j (29)
    'a̤': 'a',     # Breathy a to long a (30)
    'i̤ː': 'i:',    # Breathy long i to long i (12)
    'o̤': 'o',      # Breathy o to o (44)
    'o̤ː': 'o:',     # Breathy long o to o (44)
    
    # Nasalized vowels - map to oral counterparts
    'ãː': 'a:',    # Nasalized long a to long a (30)
    'ẽ': 'e',      # Nasalized e to e (42)
    'ɪ̃': 'ɪ',      # Nasalized short i to short i (31)
    'ỹ': 'y',     # Nasalized y to long y (39)
    'õː': 'o:',     # Nasalized long o to o (44)
    'æ̃': 'æ',      # Nasalized ae to ae (32)
    'ʌ̃': 'ʌ',      # Nasalized wedge to schwa (18)
    'ə̃': 'ə',      # Nasalized schwa to schwa (18)
    'ã': 'a',     # Nasalized a to long a (30)
    'ĩ': 'i:',     # Nasalized i to long i (12)
    'ĩː': 'i:',    # Nasalized long i to long i (12)
    'ũː': 'u:',    # Nasalized long u to long u (5)
    
    # Affricates - map to primary component
    't͡s': 'ts',     # ts to t (4)
    't͡ʃ': 'tʃ',     # tsh to sh (1)
    'd͡ʒ': 'dʒ',     # dzh to zh (2)
    't͡ɬ': 't',     # tl to t (4)
    
    # Ejectives - map to non-ejective counterparts
    'tʼ': 't',     # Ejective t to t (4)
    'kʼ': 'k',     # Ejective k to k (27)
    'qʼ': 'q',     # Ejective q to k (27)
    'pʼ': 'p',     # Ejective p to p (25)
    'sʼ': 's',     # Ejective s to s (21)
    
    # Additional vowels
    'ʏ': 'ɪ',      # Near-close near-front rounded to short i (31)
    'ʏː': 'y',    # Long near-close near-front rounded to long y (39)
    'ʊː': 'ʊ',     # Long near-close near-back rounded to short u (22)
    'ɤ': 'ə',      # Close-mid back unrounded to schwa (18)
    'ɤː': 'ə',     # Long close-mid back unrounded to schwa (18)
    'œː': 'ø',    # Long open-mid front rounded to long oe (40)
    'ɯː': 'u:',    # Long close back unrounded to long u (5)
    'ɛ̤': 'ɛ',      # Breathy open-mid front unrounded to epsilon (6)
    
    # Short/reduced vowels
    'ĕ': 'e',      # Short e to e (42)
    'ă': 'a',     # Short a to long a (30)
    'ĭ': 'ɪ',      # Short i to short i (31)
    'ŏ': 'o',      # Short o to o (44)
    'ŭ': 'ʊ',      # Short u to short u (22)
    
    # Laryngealized/creaky vowels - map to regular vowels
    'ḛ': 'e',      # Creaky e to e (42)
    'ḭ': 'i',      # Creaky i to i (41)
    'o̰': 'o',      # Creaky o to o (44)
    'ɛ̰': 'ɛ',      # Creaky epsilon to epsilon (6)
    'a̰': 'a',     # Creaky a to long a (30)
    'ʊ̰': 'ʊ',      # Creaky upsilon to upsilon (22)
    
    # Additional consonants
    'ɦ': 'h',      # Voiced h to h (37)
    'ʍ': 'w',      # Voiceless w to w (47)
    'ɢ': 'g',      # Uvular g to g (10)
    'ɱ': 'm',      # Labiodental nasal to m (28)
    'ʔ': 'noise',  # Glottal stop to noise (50)
    'ɮ': 'z',      # Voiced lateral fricative to z (36)
    'ɸ': 'f',      # Bilabial fricative to f (20)
    
    # Co-articulated stops 
    'k͡p': 'k',    # was 'noise', map to velar stop as it's typically more salient
    'ɡ͡b': 'g',    # was 'noise', map to velar stop (voiced counterpart)
    'p͡t': 'p',    # was 'noise', map to first stop in sequence
    'b͡d': 'b',    # was 'noise', map to first stop in sequence
    
    # Lengthened consonants
    'ʔː': 'q',    # was 'noise', map to closest glottal/uvular stop in inventory
    'hː': 'h',    # was 'noise', map to plain glottal fricative

    'æ̆': 'æ',      # Short ae to ae (32)
    'ɜ̆': 'ə',     # Short epsilon to long epsilon (33)
    'ɔ̆': 'ʌ',     # Short open-o to long open-o (3)
    'ə̠': 'ʌ',       # Retracted schwa (when it appears in stressed positions)
    'ə̆': 'ə',      # Short schwa to schwa (18)
    'ɒː': 'a:',    # Long open-o to long open-o (3)
    
    # Aspirated and modified affricates
    'd͡ʒʰ': 'dʒ',    # Aspirated dzh to zh (2)
    't͡sʼ': 'ts',    # Ejective ts to t (4)
    't͡ʃʼ': 'tʃ',    # Ejective tsh to sh (1)
    't͡ɬʼ': 't',    # Ejective tl to t (4)
    't͡ʃʲ': 'tʃ',    # Palatalized tsh to sh (1)
    'd͡ʒʲ': 'dʒ',    # Palatalized dzh to zh (2)
    
    # Voiceless sonorants
    'e̥': 'e',      # Voiceless e to e (42)
    'ɲ̥': 'ɲ',      # Voiceless ny to ny (38)
    'm̥': 'm',      # Voiceless m to m (28)
    'n̥': 'n',      # Voiceless n to n (11)
    'l̥': 'l',      # Voiceless l to l (16)
    'r̥': 'ɹ',      # Voiceless r to r (17)
    'ŋ̥': 'ŋ',      # Voiceless ng to ng (34)
    'i̥': 'i',      # Voiceless i to i (41)
    'u̥': 'u:',     # Voiceless u to long u (5)
    'ʎ̥': 'l',      # Voiceless palatal l to l (16)

    # Long consonants
    'tʰː': 't',    # Long aspirated t to t (4)
    'çː': 'ç',     # Long palatal fricative to h (37)
    'xː': 'h',     # Long x to h (37)
    'ɟː': 'ʒ',     # Long palatal stop to zh (2)
    'l̪ː': 'l',     # Long dental l to l (16)
    'pʰː': 'p',    # Long aspirated p to p (25)
    'θː': 'θ',     # Long th to th (46)
    'ɲː': 'ɲ',     # Long ny to ny (38)
    'wː': 'w',     # Long w to w (47)

    # Modified velars
    'kʰʲ': 'k',    # Palatalized aspirated k to k (27)
    'kʼʲ': 'k',    # Palatalized ejective k to k (27)
    'qʰʷ': 'q',    # Labialized aspirated q to k (27)
    'kʰʷ': 'k',    # Labialized aspirated k to k (27)
    'kʷʰ': 'k',    # Labialized aspirated k to k (27)
    'kʷʼ': 'k',    # Labialized ejective k to k (27)
    'qʷ': 'q',     # Labialized q to k (27)
    'qʷʼ': 'q',    # Labialized ejective q to k (27)
    'qʰ': 'q',     # Aspirated q to k (27)
    'q̠': 'q',      # Retracted q to k (27)
    'ɢʲ': 'g',     # Palatalized uvular g to g (10)
    'ɡʷ': 'g',     # Labialized g to g (10)

    # Rhotic vowels
    'e˞': 'ɚ',     # Rhotacized e to schwar (14)
    'a˞': 'ɚ',     # Rhotacized a to schwar (14)
    'o˞': 'ɚ',     # Rhotacized o to schwar (14)
    'u˞': 'ɚ',     # Rhotacized u to schwar (14)
    'i˞': 'ɚ',     # Rhotacized i to schwar (14)

    # Nasalized variants
    'ɛ̃ː': 'ɛ',     # Long nasalized epsilon to epsilon (6)
    'ʊ̃': 'ʊ',      # Nasalized upsilon to upsilon (22)
    'z̃': 'z',      # Nasalized z to z (36)
    'j̃': 'j',      # Nasalized j to j (29)
    'w̃': 'w',      # Nasalized w to w (47)
    'ʊ̰̃': 'ʊ',      # Creaky nasalized upsilon to upsilon (22)
    'æ̃ː': 'æ',     # Long nasalized ae to ae (32)
    'ɔ̃ː': 'ɔ',    # Long nasalized open-o to long open-o (3)
    'ɛ̰̃': 'ɛ',      # Creaky nasalized epsilon to epsilon (6)

    # Modified dentals/alveolars
    'd̪ʰ': 'd',     # Aspirated dental d to d (9)
    't̪ʰ': 't',     # Aspirated dental t to t (4)
    't̪ʲ': 'tʲ',     # Palatalized dental t to t (4)
    'tʲʰ': 'tʲ',     # Palatalized aspirated t to t (4)
    'dʰ': 'd',     # Aspirated d to d (9)
    'ðʲ': 'ð',     # Palatalized eth to eth (35)
    'zʲ': 'z',     # Palatalized z to z (36)
    'zʷ': 'z',     # Labialized z to z (36)
    
    # Complex modifications
    'ʃʷ': 'ʃ',     # Labialized sh to sh (1)
    'ɕʷ': 'ɕ',     # Labialized alveolo-palatal to sh (1)
    'ʑʷ': 'ʒ',     # Labialized voiced alveolo-palatal to zh (2)
    'ʕʷ': 'h',     # Labialized pharyngeal to h (37)
    'ħʷ': 'h',     # Labialized voiceless pharyngeal to h (37)
    'ʁʷ': 'ɹ',     # Labialized uvular to r (17)
    'χʲ': 'h',     # Palatalized x to h (37)
    'hʲ': 'h',     # Palatalized h to h (37)

    # Retracted/advanced variants
    'ɨ̠': 'ɨ',      # Retracted barred-i to barred-i (45)
    'ʊ̠': 'ʊ',      # Retracted upsilon to upsilon (22)
    'ʊ̟': 'ʊ',      # Advanced upsilon to upsilon (22)
    'æ̟': 'æ',      # Advanced ae to ae (32)
    'ə̟': 'ə',      # Advanced schwa to schwa (18)
    
    # Dental variants
    'n̪': 'n',      # Dental n to n (11)
    'l̪': 'l',      # Dental l to l (16)
    
    # Special vowels
    'ö': 'ø',     # O-umlaut to long oe (40)
    'ü': 'y',     # U-umlaut to long y (39)
    'ʉ': 'ɨ',     # Central u to long u (5)
    'ɞ': 'ə',      # Open-mid central rounded to schwa (18)
    'ɤ̈': 'ə',      # Advanced close-mid back unrounded to schwa (18)
    'ɯ̈': 'ɨ',      # Advanced high back unrounded
    
    # Implosives/ejectives/glottalized
    'ɗ': 'd',      # Implosive d to d (9)
    'ɓ': 'b',      # Implosive b to b (26)
    'ʄ': 'ʒ',      # Implosive palatal to zh (2)
    'dˀ': 'd',     # Glottalized d to d (9)
    'bˀ': 'b',     # Glottalized b to b (26)
    'ˀa': 'a',    # Preglottalized a to long a (30)

    # Modified retroflexes
    'ʈʰ': 't',     # Aspirated retroflex t to t (4)
    'ɖʰ': 'd',     # Aspirated retroflex d to d (9)

    # Remaining special cases
    'ɥ': 'j',      # Labial-palatal approximant to j (29)
    'ʀ': 'ɹ',      # Uvular trill to r (17)
    'ɹ̝': 'ɹ',      # Raised r to r (17)
    'ṽ': 'v',      # Nasalized v to v (15)
    'ə̥': 'ə',      # Voiceless schwa to schwa (18)
    'ə̯': 'ə',      # Non-syllabic schwa to schwa (18)
    'i̯': 'i',      # Non-syllabic i to i (41)
    'l̴': 'l',      # Velarized l to l (16)
    'dⁿ': 'd',     # Prenasalized d to d (9)
    'tⁿ': 't',     # Prenasalized t to t (4)
    
    # Breathy/creaky variants
    'd̪̤': 'd',     # Breathy dental d to d (9)
    'ɑ̤': 'a',     # Breathy long a to long a (13)
    'ṳː': 'u:',     # Breathy long u to long u (5)
    'ṳ': 'u:',      # Breathy u to long u (5)
    'ɯ̤': 'u:',     # Breathy unrounded u to long u (5)
    'ɪ̰': 'ɪ',      # Creaky short i to short i (31)
    'ɔ̰': 'ɔ',     # Creaky open-o to long open-o (3)
    'ɔ̤': 'ɔ',     # Breathy open-o to long open-o (3)
    
    # Height/backness variants
    'ɑ̝': 'a',     # Raised long a to long a (13)
    'ɛ̞': 'ɛ',      # Lowered epsilon to epsilon (6)
    'ɛ̝': 'ɛ',      # Raised epsilon to epsilon (6)
    'e̝': 'e',      # Raised e to e (42)
    'o̝': 'o',      # Raised o to o (44)
    'u̝': 'u:',     # Raised u to long u (5)
    'ɑ̞': 'a',     # Lowered long a to long a (13)
    'a̘': 'a',     # Advanced tongue root a to long a (30)
    'ä': 'a',     # Centralized a to long a (30)
    
    # Modified vowel quality
    'ɛ̈': 'ɛ',      # Centralized epsilon to epsilon (6)
    'œ̈': 'ø',     # Centralized oe to long oe (40)
    'ʌ̈': 'ʌ',      # Centralized wedge to schwa (18)
    'ɛ̠': 'ɛ',      # Retracted epsilon to epsilon (6)
    'a̠': 'a',     # Retracted a to long a (30)
    'o̠': 'o',      # Retracted o to o (44)
    'i̠': 'i',      # Retracted i to i (41)
    
    # Remaining consonant variants
    't̠': 't',      # Retracted t to t (4)
    'd̠': 'd',      # Retracted d to d (9)
    'n̠': 'n',      # Retracted n to n (11)
    't̟': 't',      # Advanced t to t (4)
    'r̟': 'ɹ',      # Advanced r to r (17)
    'r̠': 'ɹ',      # Retracted r to r (17)
    'rˠ': 'ɹ',     # Velarized r to r (17)
    'ɪ̥': 'ɪ',      # Voiceless short i to short i (31)
    'ʔʷ': 'noise', # Labialized glottal stop to noise (50)
    'ɕʼ': 'ɕ',     # Ejective alveolo-palatal to sh (1)
    'cʼ': 'k',     # Ejective c to k (27)
    'cʷʰ': 'k',    # Labialized aspirated c to k (27)
    'w̝': 'w',      # Raised w to w (47)

    'ʃ̠': 'ʃ',      # Retracted sh to sh (1)
    'ɪ̰̃': 'ɪ',      # Creaky nasalized short i to short i (31)
    'tʷʼ': 't',    # Labialized ejective t to t (4)
    'ŋʲ': 'ŋ',     # Palatalized ng to ng (34)
    'bʰ': 'b',     # Aspirated b to b (26)
    'æ̈': 'æ',      # Centralized ae to ae (32)
    'ɘ': 'ə',       # Close-mid central unrounded vowel to schwa (18)
    'tsʰ': 'ts',    # Aspirated ts to ts (4)
    'r̩ː': 'ɚ',     # Long rhotic schwa to schwar (14)
}


def get_compound_phoneme_mapping(phoneme, mapping=None):
    """Map a (possibly compound) phoneme string onto the merged inventory.

    The whole string is looked up first.  If it has no entry, the string is
    consumed left to right: at each position the longest prefix that has an
    entry in the table is replaced by its target and the targets are
    concatenated.  A character at which no prefix matches is dropped, i.e. it
    contributes nothing to the result.

    Args:
        phoneme: The phoneme string to map.
        mapping: Optional lookup table (source string -> target string).
            Defaults to the module-level ``phoneme_mapping``.  Pass another
            table, e.g. the NFC-normalized ``phoneme_mapper``, to resolve
            against that table instead.

    Returns:
        The mapped string, or ``"noise"`` when the result would be empty
        (nothing matched, or ``phoneme`` itself is empty).
    """
    if mapping is None:
        mapping = phoneme_mapping

    # A direct hit on the whole string always wins.
    if phoneme in mapping:
        return mapping[phoneme]

    pieces = []
    pos = 0
    end_of_input = len(phoneme)
    while pos < end_of_input:
        # Greedy: try the longest candidate starting at ``pos`` first.
        for stop in range(end_of_input, pos, -1):
            chunk = phoneme[pos:stop]
            if chunk in mapping:
                pieces.append(mapping[chunk])
                pos = stop
                break
        else:
            # Nothing starting here is known: skip this single character.
            pos += 1

    mapped = "".join(pieces)
    return mapped if mapped else "noise"


def create_normalized_mapping(mapping_dict):
    """Create a mapping dictionary with normalized Unicode characters.

    Every key and every value of ``mapping_dict`` is converted to Unicode
    normalization form NFC, so that a base letter followed by a combining
    mark and its precomposed equivalent end up as the same string.

    Args:
        mapping_dict: dict of str -> str to normalize.  It is not modified.

    Returns:
        A new dict with NFC-normalized keys and values.  If two keys become
        identical after normalization, the entry that comes later in
        ``mapping_dict`` silently replaces the earlier one.
    """
    # Imported locally so the function carries its own dependency.
    from unicodedata import normalize

    return {
        normalize('NFC', key): normalize('NFC', value)
        for key, value in mapping_dict.items()
    }


phoneme_mapper = create_normalized_mapping(phoneme_mapping)  # Both the keys and the values are NFC-normalized

#print(phoneme_mapper)


def analyze_phoneme_merger(phoneme_mapper):
    """Sanity-check a phoneme merger table and print a vocabulary report.

    Three checks are run on ``phoneme_mapper`` (chained targets, vowel-pair
    consistency, r-colored vowel consistency) and their messages are
    collected.  As a side effect the function prints, for the module-level
    ``complete_vocab``, every entry that resolves to ``'noise'`` or that is
    remapped and has a count above 5000, followed by the vocabulary entries
    that are not keys of the module-level ``phoneme_mapping``.

    Args:
        phoneme_mapper: dict of source phoneme -> merged target phoneme.

    Returns:
        list[str]: one human-readable message per potential issue.
    """

    def check_circular_refs(mapper):
        # Flags entries whose target is itself a key that maps elsewhere,
        # i.e. the merge would need more than one hop to settle.
        return [
            f"Potential circular reference: {source} -> {target} -> {mapper[target]}"
            for source, target in mapper.items()
            if target in mapper and mapper[target] != target
        ]

    def check_vowel_consistency(mapper):
        # Pairs that are expected to land on the same target.
        vowel_pairs = (
            ('ɑː', 'ɑːɹ'),  # Long a with/without r
            ('ɔː', 'ɔːɹ'),  # Long o with/without r
            ('iː', 'iə'),   # Long i and i-schwa
            ('ʊ', 'ʊɹ'),    # Short u with/without r
        )
        found = []
        for first, second in vowel_pairs:
            if first not in mapper or second not in mapper:
                continue  # only compare pairs that are both present
            if mapper[first] != mapper[second]:
                found.append(
                    f"Inconsistent vowel mapping: {first} -> {mapper[first]} but {second} -> {mapper[second]}"
                )
        return found

    def check_r_colored_consistency(mapper):
        expected = 'ɚ'  # every r-colored vowel below should map to schwa-r
        return [
            f"Inconsistent r-colored vowel: {vowel} -> {mapper[vowel]}, expected -> {expected}"
            for vowel in ('ɪɹ', 'ɛɹ', 'ʊɹ')
            if vowel in mapper and mapper[vowel] != expected
        ]

    def check_compound_handling(mapper):
        # Currently not part of the report (see the result list below).
        # Multi-character keys outside this allow-list are expected to start
        # with the first character of their target.
        allowed = ['tʃ', 'dʒ', 'aɪ', 'eɪ', 'oʊ', 'aʊ', 'ɔɪ', 'iə', 'uː', 'iː', 'ɑː', 'ɔː', 'ɜː', 'əl']
        found = []
        for compound in mapper:
            if len(compound) <= 1 or compound in allowed:
                continue
            if not compound.startswith(mapper[compound][0]):
                found.append(f"Potentially incorrect compound mapping: {compound} -> {mapper[compound]}")
        return found

    # Gather the messages of the active checks, in a fixed order.
    all_issues = [
        *check_circular_refs(phoneme_mapper),
        *check_vowel_consistency(phoneme_mapper),
        *check_r_colored_consistency(phoneme_mapper),
        # check_compound_handling(phoneme_mapper) is intentionally left out
    ]

    print("Testing complete vocab:")
    for kv, count in complete_vocab.items():
        mapped = get_compound_phoneme_mapping(kv)
        if mapped == kv:
            continue  # unchanged entries are not interesting
        if mapped == 'noise' or count > 5000:
            print(f"{kv} -> {mapped} \tcount: {count}")

    # Coverage: vocabulary entries without an explicit key in the raw table.
    uncovered = set(complete_vocab.keys()) - set(phoneme_mapping.keys())

    print(f"Missing phonemes: {uncovered}")
    for entry in uncovered:
        print(f"{entry} -> {complete_vocab[entry]}")

    return all_issues


def create_new_index():
    """Derive a phoneme -> integer index from ``phoneme_mapper`` and print it.

    Targets of ``phoneme_mapper`` are ranked by how many source phonemes are
    merged into them (most merged first).  ``'SIL'`` is pinned to index 0,
    the ranked targets follow from 1, and ``'noise'`` gets the last index.
    The resulting table is only printed (it is not returned or stored), then
    ``analyze_phoneme_merger`` is run and the index is cross-checked against
    the merger targets.
    """
    # How many source phonemes collapse onto each target.
    merged_counts = {}
    for target in phoneme_mapper.values():
        merged_counts[target] = merged_counts.get(target, 0) + 1

    print(f"Mapped: {len(phoneme_mapper)} phonemes onto {len(merged_counts)} phonemes")

    # Rank the regular targets, most-merged first; the two special tokens
    # get fixed positions below and are therefore left out here.
    reserved = ('SIL', 'noise')
    ranked = sorted(
        (p for p in merged_counts if p not in reserved),
        key=merged_counts.__getitem__,
        reverse=True,
    )

    # SIL first, ranked targets from 1, noise at the very end.
    new_index = {'SIL': 0}
    for position, phoneme in enumerate(ranked, start=1):
        new_index[phoneme] = position
    new_index['noise'] = len(ranked) + 1

    print("New index created:")
    print(new_index)

    print("Unique phonemes in the new index:")
    print(list(new_index))

    # Run the merger analysis and list what it found.
    issues = analyze_phoneme_merger(phoneme_mapper)

    print("Found the following potential issues:")
    for number, issue in enumerate(issues, 1):
        print(f"{number}. {issue}")

    # Cross-check: index keys versus the targets the merger can produce.
    indexed = set(new_index.keys())
    merger_outputs = {p for p in phoneme_mapper.values() if not p.endswith('*')}
    missing_indices = merger_outputs - indexed
    extra_indices = indexed - merger_outputs

    print("\nIndex validation:")
    if missing_indices:
        print(f"Merged phonemes missing from index: {missing_indices}")
    if extra_indices:
        print(f"Extra phonemes in index: {extra_indices}")
    print("Done")

def check_missing_phonemes():
    """Print how a hard-coded test inventory is covered by ``phoneme_mapper``.

    Two things are printed: the test phonemes that are not keys of
    ``phoneme_mapper`` (plus how many there are), and the test phonemes whose
    target in ``phoneme_mapper`` is ``'noise'``.  Nothing is returned.
    """


    test_phonemes = ['a', 'd͡ʒ', 'ʃʲ', 'm', 'ɜ', 'ɘ', 'ʃ', 't͡ʃʰ', 'r', 'ä', 't͡ʃ', 'ə̆', 'pʰ', 'ɜ̆', 'ʌ̈', 't', 'ʃʰ', 'kʼ', 'ʒʲ', 'ə', 'ă', 'b', 'ɨ', 'æ̈', 'j', 'ɛ̈', 'p', 'd', 'n', 'ɥ', 'ɡ', 't͡ʃʼ', 'χ', 'ˀa', 'ʒ', 'ħʷ', 'ɹ', 'ħ', 'œ̈', 'ɾ', 'ʁ', 'ɤ̈', 'z', 'i', 'χʲ', 'tʰ', 's', 'ʁʷ', 'h', 'ɛ', 'k', 'ɑ', 'x', 'ɔ', 'o', 'u', 'e', 'ɑ̃', 'ŋ', 'l', 'ʊ', 'ã', 'q̠', 'õ', 'w', 'β', 'f', 'v', 'ʎ', 'oː', 'eː', 'kʰ', 'ð', 'œ', 'ɹ̩', 'ɛ̝', 'ʔ', 'l̥', 'e̝', 'aː', 'uː', 'iː', 'ʌ̃', 'æ', 'ẽ', 'y', 'yː', 'ɪː', 'ɛː', 'øː', 'œː', 'ɑː', 'o̝', 'ʌ', 'ø', 'ɯ', 'sː', 'ɛ̃', 'c', 'ɪ', 'ɟ', 'ɲ', 'æː', 'æ̃ː', 'ʉ', 'ɫ̩', 'ʋ', 'ɫ', 'kʲ', 'ɣ', 'ɦ', 'n̩', 'ɸ', 'dʰ', 'm̩', 'h̩', 'ç', 'bʰ', 't̪', 'd̪', 'd̪̤', 'b̤', 'n̪', 'ĩ', 'ũː', 'ũ', 'j̤', 'l̪', 'pː', 'kː', 'rː', 'nː', 'l̪ː', 'bː', 'mː', 'ɞ', 't̪ʲ', 'hː', 'ʔː', 'tː', 'dː', 'ʈ', 'ɖ', 'ʂ', 'ʐ', 'r̥', 'ɔː', 'ʏː', 'ʏ', 'θ', 'n̥', 'cː', 'ɟː', 'fː', 'lː', 'ŋ̥', 'ə̯', 'ə̟', 'i̯', 'ʊ̟', 'ɛ̞', 'ʊ̠', 'r̟', 'r̠', 'ɕ', 'pʲ', 'bʲ', 'ŭ', 'tʲ', 'ĕ', 'dʲ', 'ɡʲ', 'nʲ', 'fʲ', 'zʲ', 'vʲ', 'lʲ', 'sʲ', 'xʲ', 'hʲ', 'ŏ', 'mʲ', 't͡ʃʲ', 'd͡ʒʲ', 'æ̆', 'ŋʲ', 'rʲ', 'ɾʲ', 'ĭ', 'ɔ̆', 's̪', 'ɱ', 'ɽ', 'ɳ', 'ʈʰ', 'ɖʰ', 'ɵ', 't̪ʰ', 'd͡ʒʰ', 'ɭ', 'ʊ̃', 'sʰ', 'ḭ', 'cʰ', 'ʊ̰', 'ɛ̰', 'ɪ̰', 'a̰', 'ḛ', 'o̰', 'ɛ̰̃', 'ɪ̃', 'ʊ̰̃', 'ɲ̥', 'æ̃', 'm̥', 'ɪ̰̃', 'ɔ̰', 'wː', 'ɔ̃ː', 'ɗ', 'ɔ̃', 'õː', 'ɯː', 'ə̃', 'tʰː', 'pʰː', 'vː', 'zː', 'ʃː', 'jː', 'ɲː', 'xː', 'çː', 'ɓ', 'ãː', 't͡sʼ', 'ɻ', 'ʀ', 't͡s', 'a', 'b', 'w', 'e', 'ɔ', 'p', 'ɛ', 't', 'o', 't͡ʃ', 'u', 'd', 'k', 'ɔ̃', 'kʷ', 'ɡ', 'k͡p', 'm', 'n', 'n̠', 'j', 'f', 's', 'ç', 'ɹ', 'l', 'i', 'ʍ', 'd̠', 'ʐ', 'ŋ', 'ɥ', 't̠', 'ɕʷ', 'ɕ', 'pʰ', 'tʰ', 'sʰ', 'kʰ', 'z', 'ä', 'h', 'v', 'ʃ', 'ʒ', 'r', 'ü', 'y', 'ʔ', 'ɪ', 'æ', 'ə', 'q̠', 'ɞ', 't͡ʃʰ', 'ĩ', 'ã', 'õ', 'ʋ', 'x', 'ɾ', 'ɓ', 'ɗ', 'c', 'ɟ', 'ʄ', 'aː', 'ɲ', 'ɔː', 'tʲ', 'oː', 'ɤː', 'uː', 'ʊː', 'ɳ', 'ɯː', 'ðʲ', 'tʲʰ', 'ɛ̃', 'ɣ', 'kʲ', 'ũ', 'ĩː', 'rˠ', 'ɛ̃ː', 'ãː', 'ɔ̃ː', 'ũː', 't̪', 'ʑʷ', 'ʑ', 'ɡʷ', 'ŋʷ', 'ɽ', 'o̠', 'w̃', 'ɯ', 'ö', 'ɡ͡b', 'd͡ʒ', 'ʁ', 'q', 'i̠', 'ɛ̠', 'v̩', 'l̥', 'ɤ', 'r̥', 'ɢ', 
'ɢʲ', 'χ', 'kʰʲ', 'm̥', 'n̥', 'nː', 'pː', 'lː', 'rː', 'æː', 'eː', 'o˞', 'e˞', 'a˞', 'i˞', 'iː', 'u˞', 'ʕʷ', 'ʕ', 'xʷ', 'ɬ', 'qʷ', 'ɑ', 'ɪ̃', 'ẽ', 'ʊ', 'd̪', 'd͡ʒʰ', 'ɦ', 't̪ʰ', 'd̪ʰ', 'dʰ', 'bʰ', 'ʌ', 'pʼ', 'ʊ̃', 'kʼ', 'β', 'kʼʲ', 'ħ', 'qʼ', 'cʼ', 'kʰʷ', 'qʰʷ', 'ɨ', 'ð', 'ɖ', 'ɸ', 'ʏ', 'ø', 'l̩', 'dʷ', 'pʷ', 'bʷ', 'tʷ', 'ṽ', 'z̃', 'ʃʷ', 'ʒʷ', 'a̘', 't͡s', 'n̤', 'ŋ̩', 'h̩', 'ɹ̝', 'ɑː', 'ɑ̞', 'ɑ̝', 'ɛː', 'ɪː', 'u̝', 'sʲ', 'ɜ', 'ɨː', 'θ', 'l̴', 'n̩', 'j̃', 't͡ɬ', 'sʼ', 'kʷʼ', 'cʰ', 'qʷʼ', 'zʷ', 'qʰ', 'kʷʰ', 't͡ɬʼ', 'cʷʰ', 'ʁʷ', 'tʷʼ', 'a̤', 'ɔ̤', 'o̤ː', 'i̤ː', 'ṳ', 'o̤', 'ṳː', 'ɯ̤', 'tʼ', 'ɑ̃', 'ɫ', 'ɑ̤', 'ʌ̃', 'ɛ̤', 'p͡t', 'b͡d', 'mʷ', 'w̝', 'ʎ̥', 'ɮ', 'ʃ̠', 'fː', 'i̥', 'u̥', 'ɪ̥', 'zː', 'sː', 'ʎ', 'ə̥', 'ʃː', 'e̥', 'ỹ', 'ɯ̈', 'ʉ', 'ɒ', 'xː', 'l̪', 'n̪', 'θː', 'ɒː', 'dˀ', 'bˀ', 't̟', 'æ̟', 'dⁿ', 'ɨ̠', 'tⁿ', 'a̠', 't͡sʰ', 'ɕʰ', 'm̩', 'ɭ', 'ə̃', 'ɕʼ', 't͡ʃʼ', 'ʔʷ', 'tsʰ'] # from UCLA phonetics, some repeated
    # Set difference: test phonemes that are not keys of phoneme_mapper.
    # NOTE(review): phoneme_mapper keys are NFC-normalized while the literals
    # above are compared as written; a test phoneme typed in a different
    # normalization form would be reported as missing - confirm this is intended.
    missing_phonemes = set(test_phonemes) - set(phoneme_mapper.keys())
    print(f"Missing phonemes: {missing_phonemes}")
    print(len(missing_phonemes))

    # list of phonemes that map to noise:
    noise_phonemes = [k for k, v in phoneme_mapper.items() if v == 'noise']
    # Keep only the noise-mapped phonemes that also occur in the test inventory.
    noise_phonemes_in_test_set = set(noise_phonemes) & set(test_phonemes)
    print(f"Noise phonemes in test set: {noise_phonemes_in_test_set}")
    # Author's recorded result: only {'ʔ', 'ʔʷ'} are mapped to noise from the UCLA dataset

def check_duplicates(mapping=None):
    """Report keys that collide after NFC normalization with different targets.

    A dict can never hold the same key twice, so looking for duplicate keys
    in an already-built, already-normalized table cannot find anything.  The
    collisions that can still be observed are source strings that are
    distinct as written but become the same key once NFC-normalized; when
    the normalized table is built, the later entry silently replaces the
    earlier one.  This function groups the table by normalized key and
    reports every key that ends up with more than one distinct target.

    Args:
        mapping: Optional dict of source -> target strings to inspect.
            Defaults to the module-level, un-normalized ``phoneme_mapping``.

    Returns:
        dict mapping each colliding normalized key to the set of its
        distinct (normalized) targets; empty when there are no conflicts.
        The same information is printed.
    """
    from collections import defaultdict
    from unicodedata import normalize

    if mapping is None:
        mapping = phoneme_mapping

    # Normalized key -> every distinct normalized target seen for it.
    targets_by_key = defaultdict(set)
    for key, value in mapping.items():
        targets_by_key[normalize('NFC', key)].add(normalize('NFC', value))

    # Only keys with conflicting targets are of interest.
    duplicates = {key: values for key, values in targets_by_key.items() if len(values) > 1}

    print("Duplicate keys with different values:", len(duplicates))
    for key, values in duplicates.items():
        print(f"Key '{key}' has different values: {values}")

    return duplicates


def make_phoneme_groups():
    """Build and print the coarse phoneme-group tables.

    The group definitions below are checked against the module-level
    ``phoneme_mapped_index``: members that are not in the index are removed
    from their group, and every index entry must belong to some group
    (otherwise an ``AssertionError`` is raised).  The function then

    * sets the module-level ``phoneme_groups_index`` (group name -> group id,
      with ``"SIL"`` at 0 and ``"noise"`` last), and
    * prints that table together with ``base66_to_groups``, which maps each
      base phoneme index to the id of its group.

    Nothing is returned.
    """
    global phoneme_groups_index

    # Finer-grained alternative grouping.  It is not used by the code below.
    phoneme_groups_19 = {
        # Vowels - Separated by height and frontness
        "high_front_vowels": ["i", "i:", "ɪ", "y", "ʏ", "iː"],
        "high_back_vowels": ["u", "u:", "ʊ", "ɯ", "ʉ", "ɨ", "uː"],
        "mid_front_vowels": ["e", "e:", "ɛ", "ø", "œ", "eː"],
        "mid_central_vowels": ["ə", "ɜ", "ɜ:", "ɚ", "ʌ", "ɘ", "ɵ"],
        "mid_back_vowels": ["o", "o:", "ɔ", "ɔ:", "ɤ", "oː"],
        "low_vowels": ["a", "a:", "æ", "ɐ", "ɑ", "ɑ:", "ɒ", "aː"],
        "diphthongs": ["aɪ", "eɪ", "ɔɪ", "aʊ", "oʊ", "ɛə", "ɪə", "ʊə"],

        # Consonants - Organized by manner and voicing
        "voiceless_stops": ["p", "t", "k", "q", "ʔ", "ʈ", "c"],
        "voiced_stops": ["b", "d", "g", "ɢ", "ɖ", "ɟ"],
        "voiceless_fricatives": ["f", "θ", "s", "ʃ", "ç", "x", "h", "ħ", "ʂ", "ɕ", "χ"],
        "voiced_fricatives": ["v", "ð", "z", "ʒ", "ʝ", "ɣ", "ʕ", "ʐ", "ʑ", "ʁ"],
        "voiceless_affricates": ["ts", "tʃ", "tɕ", "ʈʂ"],
        "voiced_affricates": ["dz", "dʒ", "dʑ", "ɖʐ"],
        "nasals": ["m", "n", "ɲ", "ŋ", "ɴ", "ɱ", "ɳ"],

        # Liquids, glides, and palatalized sounds
        "laterals": ["l", "ɭ", "ʎ", "ʟ"],
        "rhotics": ["r", "ɾ", "ɹ", "ʀ", "ɽ", "ɻ"],
        "glides": ["j", "w", "ɥ", "ɰ"],
        "palatalized": ["ɭʲ", "rʲ", "tʲ", "nʲ"],

        "SIL": ["SIL"],
        "noise": ["noise"],
    }

    # Grouping actually used.  The insertion order matters: it determines the
    # group ids assigned further down, and "SIL"/"noise" come last.
    phoneme_groups = {
        # Vowels - Adjusted based on confusion patterns
        "front_vowels": ["i", "i:", "ɪ", "y", "ʏ", "iː", "e", "e:", "ɛ", "ø", "œ", "eː"],  # Merged high/mid front
        "central_vowels": ["ə", "ɜ", "ɜ:", "ɚ", "ʌ", "ɘ", "ɵ"],  # Keep central vowels separate
        "back_vowels": ["u", "u:", "ʊ", "ɯ", "ʉ", "ɨ", "uː", "o", "o:", "ɔ", "ɔ:", "ɤ", "oː"],  # Merged high/mid back
        "low_vowels": ["a", "a:", "æ", "ɐ", "ɑ", "ɑ:", "ɒ", "aː"],  # Keep low vowels separate
        "diphthongs": ["aɪ", "eɪ", "ɔɪ", "aʊ", "oʊ", "ɛə", "ɪə", "ʊə"],  # Keep diphthongs separate

        # Consonants - Maintain voicing distinction for stops and fricatives
        "voiceless_stops": ["p", "t", "k", "q", "ʔ", "ʈ", "c", "tʲ"],  # Add palatalized t
        "voiced_stops": ["b", "d", "g", "ɢ", "ɖ", "ɟ"],
        "voiceless_fricatives": ["f", "θ", "s", "ʃ", "ç", "x", "h", "ħ", "ʂ", "ɕ", "χ"],
        "voiced_fricatives": ["v", "ð", "z", "ʒ", "ʝ", "ɣ", "ʕ", "ʐ", "ʑ", "ʁ"],

        # Keep affricates distinction by voicing
        "voiceless_affricates": ["ts", "tʃ", "tɕ", "ʈʂ"],
        "voiced_affricates": ["dz", "dʒ", "dʑ", "ɖʐ"],

        # Merge palatalized nasals with base nasals
        "nasals": ["m", "n", "nʲ", "ɲ", "ŋ", "ɴ", "ɱ", "ɳ"],

        # Merge palatalized laterals with base laterals
        "laterals": ["l", "ɭ", "ɭʲ", "ʎ", "ʟ"],

        # Merge palatalized rhotics with base rhotics
        "rhotics": ["r", "rʲ", "ɾ", "ɹ", "ʀ", "ɽ", "ɻ"],

        # Keep glides separate
        "glides": ["j", "w", "ɥ", "ɰ"],

        # Special tokens
        "SIL": ["SIL"],
        "noise": ["noise"],
    }

    # Compare the grouped phonemes with the base index in both directions.
    grouped = [p for members in phoneme_groups.values() for p in members]
    not_in_index = set(grouped) - set(phoneme_mapped_index.keys())
    print(f"extra phonemes: {not_in_index}")
    ungrouped = set(phoneme_mapped_index.keys()) - set(grouped)
    print(f"missing phonemes: {ungrouped}")
    assert not ungrouped, "Phoneme groups do not cover all phonemes"

    # Drop group members that have no entry in the base index.
    for stray in not_in_index:
        for members in phoneme_groups.values():
            if stray in members:
                members.remove(stray)

    # Same groups, expressed as base-index numbers instead of symbols.
    phoneme_groups_based = {
        name: [phoneme_mapped_index[p] for p in members]
        for name, members in phoneme_groups.items()
    }

    # Sanity check of the conversion above.
    for name, members in phoneme_groups.items():
        for p in members:
            assert phoneme_mapped_index[p] in phoneme_groups_based[name], f"{p} not in {name}"

    # Group ids: SIL is 0, regular groups follow their position in the dict
    # (counted from 1), and noise takes the next free id.
    phoneme_groups_index = {"SIL": 0}
    for position, name in enumerate(phoneme_groups, start=1):
        if name not in ("SIL", "noise"):
            phoneme_groups_index[name] = position
    phoneme_groups_index["noise"] = len(phoneme_groups_index)
    print("phoneme_groups_index:", phoneme_groups_index)
    print("total groups (excluding noise)", len(phoneme_groups_index)-1)

    # Base phoneme index -> group id.
    base66_to_groups = {}
    for p, base_idx in phoneme_mapped_index.items():
        for name, members in phoneme_groups.items():
            if p in members:
                base66_to_groups[base_idx] = phoneme_groups_index[name]

    # Every base phoneme must have received a group.
    assert len(base66_to_groups) == len(phoneme_mapped_index), "Not all phonemes are mapped to a group"
    print("base66_to_groups:", base66_to_groups)


#main

if __name__ == "__main__":
    # Script entry point.  The calls below are alternative maintenance tasks;
    # only the uncommented one runs.

    # Create the new index
    #create_new_index()
    
    #check_missing_phonemes()
    #check_duplicates()

    # Build the phoneme groups from phoneme_mapped_index and print the tables.
    make_phoneme_groups()