Piyazon committed on
Commit
aae642a
·
1 Parent(s): df12df5
Files changed (3) hide show
  1. Dockerfile +34 -0
  2. app.py +7 -0
  3. utils.py +338 -0
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
# you will also find guides on how best to write your Dockerfile

FROM python:3.12-slim

# Install system dependencies for torch and audio; the apt lists are removed
# in the same layer so they do not end up in the image.
RUN apt-get update && apt-get install -y \
    ffmpeg \
    git \
    && rm -rf /var/lib/apt/lists/*

# --- Settings you can override in the Space "Build arguments" ---
ARG REPO_URL="https://github.com/vibevoice-community/VibeVoice.git"
# Branch or tag to clone (pin a tag for reproducibility). `git clone --branch`
# accepts branch and tag names, not a bare commit SHA.
# NOTE: Dockerfile comments must be on their own line; a trailing "# ..." after
# an instruction is parsed as part of that instruction's arguments.
ARG REPO_REF="main"

# Clone the source into /app
RUN git clone --depth 1 --branch "${REPO_REF}" "${REPO_URL}" /app

WORKDIR /app

# Editable install of the cloned project (runs as root, before the user switch)
RUN pip install --no-cache-dir -e .

# Run the remaining steps and the server as an unprivileged user (uid 1000)
RUN useradd -m -u 1000 user
USER user
# pip installs performed as `user` put console scripts in ~/.local/bin
ENV PATH="/home/user/.local/bin:$PATH"

# Copy requirements first so the dependency layer is cached independently
# of changes to the rest of the build context.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Copy the build context (app.py, utils.py, ...) on top of the cloned sources
COPY --chown=user . /app
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
33
+
34
+
app.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+
3
# ASGI application object exposed at module level under the name `app`.
app = FastAPI()


@app.get("/")
def greet_json():
    """Handle GET / by returning a fixed greeting payload."""
    greeting = {"Hello": "World!"}
    return greeting
utils.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Uyghur Text Processing Utilities
3
+ Contains functions for processing Uyghur text, numbers, and script conversion.
4
+ """
5
+
6
+ import unicodedata
7
+ from pypinyin import pinyin, Style
8
+ import re
9
+ from umsc import UgMultiScriptConverter
10
+
11
+ # Module-level Uyghur script converters, built once at import time ('UAS'/'ULS' presumably mean Uyghur Arabic/Latin Script - confirm against the umsc docs)
12
+ ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
13
+ ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
14
+
15
+
16
def number_to_uyghur_arabic_script(number_str):
    """
    Spell out a number in Uyghur (Arabic script).

    Supported shapes, after commas and spaces are removed from the input:

    * integer of up to 9 digits, e.g. '123'
    * decimal with up to 9 integer and 9 fractional digits, e.g. '0.001';
      the fractional part is read as a whole number preceded by a
      fractional term (tenths, hundredths, ...). Trailing zeros are
      ignored, so '1.0' and '1.00' both read as "<one> پۈتۈن".
    * single-digit fraction, e.g. '1/4'
    * percentage, e.g. '25%'
    * ordinal, marked by a trailing '_' or '-', e.g. '1968_'

    Args:
        number_str (str): Number as a string.

    Returns:
        str: Uyghur pronunciation in Arabic script. Inputs that are out of
        range (more than 9 integer or fractional digits, ordinals above
        999999999, fractions with multi-digit parts) are returned as text
        with the ordinal/percent marker already stripped.
        NOTE: ordinals whose last word is rewritten (tens, or a non-zero
        final digit of a multi-word number) end with a trailing space;
        single-digit ordinals and those ending in a hundred or larger unit
        do not. This is kept as-is for existing callers.

    Raises:
        ValueError: If a part that must be numeric cannot be parsed by
            ``int`` (e.g. an empty integer part such as '.5').
    """
    # Uyghur number words in Arabic script
    digits = {
        0: 'نۆل', 1: 'بىر', 2: 'ئىككى', 3: 'ئۈچ', 4: 'تۆت', 5: 'بەش',
        6: 'ئالتە', 7: 'يەتتە', 8: 'سەككىز', 9: 'توققۇز'
    }
    ordinals = {
        1: 'بىرىنجى', 2: 'ئىككىنجى', 3: 'ئۈچىنجى', 4: 'تۆتىنجى', 5: 'بەشىنجى',
        6: 'ئالتىنجى', 7: 'يەتتىنجى', 8: 'سەككىزىنجى', 9: 'توققۇزىنجى'
    }
    tens = {
        10: 'ئون', 20: 'يىگىرمە', 30: 'ئوتتۇز', 40: 'قىرىق', 50: 'ئەللىك',
        60: 'ئاتمىش', 70: 'يەتمىش', 80: 'سەكسەن', 90: 'توقسان'
    }
    units = [
        (1000000000, 'مىليارد'),  # billion
        (1000000, 'مىليون'),  # million
        (1000, 'مىڭ'),  # thousand
        (100, 'يۈز')  # hundred
    ]
    fractions = {
        1: 'ئوندا',  # tenths
        2: 'يۈزدە',  # hundredths
        3: 'مىڭدە',  # thousandths
        4: 'ئون مىڭدە',  # ten-thousandths
        5: 'يۈز مىڭدە',  # hundred-thousandths
        6: 'مىليوندا',  # millionths
        7: 'ئون مىليوندا',  # ten-millionths
        8: 'يۈز مىليوندا',  # hundred-millionths
        9: 'مىليارددا'  # billionths
    }

    def integer_to_words(num):
        """Spell out a non-negative integer using the tables above."""
        if num == 0:
            return digits[0]

        result = []
        num = int(num)

        # Peel off billions, millions, thousands and hundreds in turn.
        for value, unit_name in units:
            if num >= value:
                count = num // value
                if count == 1 and value >= 100:  # e.g., 100 → "يۈز", not "بىر يۈز"
                    result.append(unit_name)
                else:
                    result.append(integer_to_words(count) + ' ' + unit_name)
                num %= value

        # What remains is below 100: an exact ten, ten + digit, or a digit.
        if num in tens:
            result.append(tens[num])
        elif num > 10:
            result.append(tens[num // 10 * 10] + ' ' + digits[num % 10])
        elif num > 0:
            result.append(digits[num])

        return ' '.join(result)

    # Clean the input (remove commas or spaces)
    number_str = number_str.replace(',', '').replace(' ', '')

    # Ordinal: marked by a trailing '_' or '-'
    if number_str.endswith(('_', '-')):
        number_str = number_str[:-1]  # Remove the marker
        num = int(number_str)
        if num > 999999999:
            return number_str
        if num in ordinals:  # Use special forms for single-digit ordinals
            return ordinals[num]

        # Convert to words and turn the last word into its ordinal form
        words = integer_to_words(num).split()
        last_num = num % 100  # Last two digits decide between tens and ones
        if last_num in tens:
            words[-1] = tens[last_num] + 'ىنجى '  # e.g., 60_ → ئاتمىشىنجى
        else:
            last_digit = num % 10
            if last_digit in ordinals:
                # Replace the final digit word with its ordinal form
                words[-1] = ordinals[last_digit] + ' '
            else:
                # Number ends in a unit word (hundred, thousand, ...) or is 0
                words[-1] += 'ىنجى'
        return ' '.join(words)

    # Percentage: strip the sign now, append the spoken term at the end
    is_percentage = number_str.endswith('%')
    if is_percentage:
        number_str = number_str[:-1]

    # Fraction: only single-digit numerator and denominator are spelled out
    if '/' in number_str:
        numerator, denominator = map(int, number_str.split('/'))
        if numerator in digits and denominator in digits:
            return f"{digits[denominator]}دە {digits[numerator]}"
        return number_str

    # Split into integer and decimal parts
    parts = number_str.split('.')
    integer_part = parts[0]
    decimal_part = parts[1] if len(parts) > 1 else None

    # Validate integer part (up to 9 digits)
    if len(integer_part) > 9:
        return number_str

    # Validate decimal part (up to 9 digits)
    if decimal_part and len(decimal_part) > 9:
        return number_str

    # Convert the integer part
    pronunciation = integer_to_words(int(integer_part))

    # Handle decimal part as a whole number with fractional term
    if decimal_part:
        pronunciation += ' پۈتۈن'
        # Trailing zeros carry no value. An all-zero decimal part ('0',
        # '00', ...) leaves nothing to pronounce; checking the stripped
        # string avoids int('') for inputs such as '1.00'.
        significant = decimal_part.rstrip('0')
        if significant:
            # The number of significant places selects the fractional term
            fraction_term = fractions.get(len(significant), 'مىليارددا')
            pronunciation += ' ' + fraction_term + \
                ' ' + integer_to_words(int(significant))

    # Append percentage term if applicable
    if is_percentage:
        pronunciation += ' پىرسەنت'

    return pronunciation.strip()
168
+
169
+
170
def _is_convertible_number(token):
    """Return True if token is a fraction, percentage, ordinal, decimal or integer."""
    if token.count('/') == 1:
        # Fraction: e.g., "1/4"
        numerator, denominator = token.split('/')
        return numerator.isdigit() and denominator.isdigit()
    if token.endswith('%'):
        # Percentage: e.g., "25%"
        return token[:-1].isdigit()
    if token.endswith(('_', '-')):
        # Ordinal: e.g., "1_"
        return token[:-1].isdigit()
    if token.count('.') == 1:
        # Decimal: e.g., "3.14"
        whole, frac = token.split('.')
        return whole.isdigit() and frac.isdigit()
    # Integer: e.g., "123"
    return token.isdigit()


def process_uyghur_text_with_numbers(text):
    """
    Replace the numbers embedded in a text with their Uyghur pronunciation.

    Every '%' is first replaced by the spoken percent word surrounded by
    spaces. The text is then scanned for maximal runs of digits and the
    symbols '/', '.', '%', '_' and '-'. A run that forms a valid fraction,
    ordinal, decimal or integer is passed to number_to_uyghur_arabic_script;
    everything else is copied through unchanged.

    Two cases are handled explicitly:

    * A number directly followed by sentence-final period(s) ("5.",
      "3.14.") is converted and the period(s) are kept after it, instead
      of the whole run being rejected as an invalid number.
    * A converted ordinal is always followed by exactly one space, so that
      forms written as "5-<word>" are not glued to the following word.

    Args:
        text (str): Input string with Uyghur text and numbers
            (e.g., '1/4 كىلو 25% تەملىك').

    Returns:
        str: String with numbers converted to Uyghur pronunciation,
        non-numeric text preserved.
    """
    text = text.replace('%', ' پىرسەنت ')
    # '%' stays in the set for parity with the validator, although the
    # replacement above means none is left in the text at this point.
    number_chars = frozenset('0123456789/.%_-')

    pieces = []
    pos = 0
    length = len(text)
    while pos < length:
        if text[pos] not in number_chars:
            # Whitespace and any other non-number character: copy as is
            pieces.append(text[pos])
            pos += 1
            continue

        # Collect the maximal run of potential number characters
        start = pos
        while pos < length and text[pos] in number_chars:
            pos += 1
        token = text[start:pos]

        # If the run is not a number as a whole, retry without trailing
        # periods so sentence punctuation does not block the conversion.
        core = token if _is_convertible_number(token) else token.rstrip('.')
        if not _is_convertible_number(core):
            # Not a valid number format, treat as regular text
            pieces.append(token)
            continue

        try:
            spoken = number_to_uyghur_arabic_script(core)
        except ValueError:
            # If conversion fails, keep the original number string
            spoken = core
        else:
            if core.endswith(('_', '-')):
                # The converter adds a trailing space only for some ordinal
                # forms; normalise so every ordinal is separated from the
                # word that follows it.
                spoken = spoken.rstrip() + ' '
        pieces.append(spoken)
        # Re-attach any periods that were split off the number
        pieces.append(token[len(core):])

    return ''.join(pieces)
250
+
251
+
252
def fix_pauctuations(batch):
    """
    Normalize and clean Uyghur text by fixing punctuation and character variants.

    The text is lowercased, NFKC-normalized, and a few look-alike letters
    are mapped to the letter used in the vocabulary. Characters in the
    vocabulary are kept. Sentence-ending punctuation ('.', '?', '؟')
    becomes two spaces and every other character becomes one space, so
    sentence boundaries stay distinguishable from ordinary gaps.

    Args:
        batch (str): Input text to be normalized.

    Returns:
        str: Normalized text containing only vocabulary characters and spaces.
    """
    # Replace Uyghur character variants (none of the targets is itself a
    # source, so a single translate pass equals the chained replacements)
    variant_table = str.maketrans({
        'ژ': 'ج',
        'ک': 'ك',
        'ی': 'ى',
        'ه': 'ە',
    })
    vocab = frozenset([
        " ", "ئ", "ا", "ب", "ت", "ج", "خ", "د", "ر", "ز", "س", "ش", "غ", "ف", "ق", "ك",
        "ل", "م", "ن", "و", "ى", "ي", "پ", "چ", "ڭ", "گ", "ھ", "ۆ", "ۇ", "ۈ", "ۋ", "ې", "ە",
    ])
    sentence_enders = frozenset({'.', '?', '؟'})

    batch = unicodedata.normalize('NFKC', batch.lower()).translate(variant_table)

    result = []
    for char in batch:
        if char in vocab:
            result.append(char)
        elif char in sentence_enders:
            result.append('  ')  # Sentence end: two spaces
        else:
            result.append(' ')  # Any other non-vocab character: one space

    return ''.join(result)
287
+
288
+
289
def chinese_to_pinyin(mixed_text):
    """
    Romanize the Chinese portions of a mixed-language string.

    Each run of characters in the CJK Unified Ideographs range
    (U+4E00-U+9FFF) is replaced by its Pinyin in ``Style.NORMAL``, with the
    syllables of a run joined by single spaces. Text outside that range is
    returned untouched.

    Args:
        mixed_text (str): Input string that may contain Chinese characters
            alongside other languages (e.g., English, Uyghur).

    Returns:
        str: The input with every Chinese run replaced by its Pinyin.
    """
    def _romanize(found):
        # pinyin() yields one list of readings per character; take the first
        syllables = pinyin(found.group(0), style=Style.NORMAL)
        return ' '.join(reading[0] for reading in syllables)

    return re.sub(r'[\u4e00-\u9fff]+', _romanize, mixed_text)
312
+
313
+
314
def preprocess_uyghur_text(text):
    """
    Run the full preprocessing pipeline over a piece of text.

    The stages are applied in this order, each one receiving the output of
    the previous one:

    1. chinese_to_pinyin - Chinese characters to Pinyin
    2. ug_latn_to_arab - the module's 'ULS' -> 'UAS' converter
       (presumably Latin script to Arabic script)
    3. process_uyghur_text_with_numbers - numbers to spoken form
    4. fix_pauctuations - punctuation fixing and normalization

    Args:
        text (str): Input text in any supported format.

    Returns:
        str: The text after all four stages.
    """
    stages = (
        chinese_to_pinyin,
        ug_latn_to_arab,
        process_uyghur_text_with_numbers,
        fix_pauctuations,
    )
    for stage in stages:
        text = stage(text)
    return text
338
+