{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b92d046f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['VLLM_USE_V1'] = '0'\n",
    "os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'\n",
    "os.environ[\"VLLM_LOGGING_LEVEL\"] = \"ERROR\"\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = \"0\"\n",
    "import torch\n",
    "import warnings\n",
    "import numpy as np\n",
    "\n",
    "warnings.filterwarnings('ignore')\n",
    "warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
    "warnings.filterwarnings('ignore', category=UserWarning)\n",
    "\n",
    "from qwen_omni_utils import process_mm_info\n",
    "from transformers import Qwen3OmniMoeProcessor\n",
    "\n",
    "def _load_model_processor():\n",
    "    if USE_TRANSFORMERS:\n",
    "        from transformers import Qwen3OmniMoeForConditionalGeneration\n",
    "        if TRANSFORMERS_USE_FLASH_ATTN2:\n",
    "            model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(MODEL_PATH,\n",
    "                                                                         dtype='auto',\n",
    "                                                                         attn_implementation='flash_attention_2',\n",
    "                                                                         device_map=\"auto\")\n",
    "        else:\n",
    "            model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(MODEL_PATH, device_map=\"auto\", dtype='auto')\n",
    "    else:\n",
    "        from vllm import LLM\n",
    "        model = LLM(\n",
    "            model=MODEL_PATH, trust_remote_code=True, gpu_memory_utilization=0.95,\n",
    "            tensor_parallel_size=torch.cuda.device_count(),\n",
    "            limit_mm_per_prompt={'image': 1, 'video': 3, 'audio': 3},\n",
    "            max_num_seqs=1,\n",
    "            max_model_len=8192,\n",
    "            seed=1234,\n",
    "        )\n",
    "\n",
    "    processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)\n",
    "    return model, processor\n",
    "\n",
    "def run_model(model, processor, messages, return_audio, use_audio_in_video):\n",
    "    if USE_TRANSFORMERS:\n",
    "        text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)\n",
    "        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)\n",
    "        inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors=\"pt\", padding=True, use_audio_in_video=use_audio_in_video)\n",
    "        inputs = inputs.to(model.device).to(model.dtype)\n",
    "        text_ids, audio = model.generate(**inputs,\n",
    "                                            thinker_return_dict_in_generate=True,\n",
    "                                            thinker_max_new_tokens=8192,\n",
    "                                            thinker_do_sample=True,\n",
    "                                            thinker_top_p=0.95,\n",
    "                                            thinker_top_k=20,\n",
    "                                            thinker_temperature=0.6,\n",
    "                                            speaker=\"Chelsie\",\n",
    "                                            use_audio_in_video=use_audio_in_video,\n",
    "                                            return_audio=return_audio)\n",
    "        response = processor.batch_decode(text_ids.sequences[:, inputs[\"input_ids\"].shape[1] :], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
    "        if audio is not None:\n",
    "            audio = np.array(audio.reshape(-1).detach().cpu().numpy() * 32767).astype(np.int16)\n",
    "        return response, audio\n",
    "    else:\n",
    "        from vllm import SamplingParams\n",
    "        sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=20, max_tokens=4096)\n",
    "        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
    "        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)\n",
    "        inputs = {'prompt': text, 'multi_modal_data': {}, \"mm_processor_kwargs\": {\"use_audio_in_video\": use_audio_in_video}}\n",
    "        if images is not None: inputs['multi_modal_data']['image'] = images\n",
    "        if videos is not None: inputs['multi_modal_data']['video'] = videos\n",
    "        if audios is not None: inputs['multi_modal_data']['audio'] = audios\n",
    "        outputs = model.generate(inputs, sampling_params=sampling_params)\n",
    "        response = outputs[0].outputs[0].text\n",
    "        return response, None\n"
   ]
  },
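  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7c4e9a21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional helper (an illustrative sketch, not part of the original workflow): when\n",
    "# run_model is used with the transformers backend and return_audio=True, it returns the\n",
    "# talker waveform as int16 samples. Qwen-Omni talkers emit 24 kHz audio; adjust\n",
    "# sample_rate if your checkpoint differs.\n",
    "import soundfile as sf\n",
    "\n",
    "def save_response_audio(audio, path=\"response.wav\", sample_rate=24000):\n",
    "    \"\"\"Write the int16 waveform returned by run_model to a WAV file and return its path.\"\"\"\n",
    "    if audio is None:\n",
    "        raise ValueError(\"No audio returned (return_audio=False or the vLLM backend was used).\")\n",
    "    sf.write(path, audio, sample_rate)\n",
    "    return path"
   ]
  },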
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d37dcedc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import librosa\n",
    "import audioread\n",
    "\n",
    "from IPython.display import Audio\n",
    "\n",
    "MODEL_PATH = \"NandemoGHS/Anime-Speech-Japanese-Refiner-FP8-DYNAMIC\"\n",
    "\n",
    "USE_TRANSFORMERS = False\n",
    "TRANSFORMERS_USE_FLASH_ATTN2 = True\n",
    "\n",
    "model, processor = _load_model_processor()\n",
    "\n",
    "USE_AUDIO_IN_VIDEO = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5bf60bf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "audio_path = \"https://huggingface.co/NandemoGHS/Anime-Speech-Japanese-Refiner/resolve/main/examples/example1.wav\"\n",
    "\n",
    "original_transcription = \"あっ、あぁんっ、好き、大好きですわ…。もっと…はぁ、んんっ、はぁんっ、もっとぉ!\"\n",
    "\n",
    "prompt = f\"\"\"これから与えられる音声クリップとその文字起こしについて、声の特徴と読み上げスタイル、感情などをアノテーションしたうえで、日本語の短いキャプションで要約してください。\n",
    "出力には以下の項目を含めてください。\n",
    "\n",
    "profile: 話者プロファイル(例: お姉さん的な女性声/落ち着いた男性声/少女声 等)\n",
    "mood: 感情・ムード(例: 明るい/落ち着いた/緊張/怒り/恐怖/悲しみ/快楽 等)\n",
    "speed: 話速(例: とても遅い/やや速い/一定/(1.2×) 等)\n",
    "prosody: 抑揚・リズム(例: 平坦/メリハリ/語尾上げ下げ/ため息混じり 等)\n",
    "pitch_timbre: ピッチ/声質(例: 高め/低め/息多め/張りのある/囁き 等)\n",
    "style: 発話スタイル(例: ナレーション風/会話調/朗読調/プレゼン調/囁き/喘ぎ/嗚咽/叫び 等)\n",
    "emotion: 感情タグ(次のリストから1つ選択: [\"angry\", \"sad\", \"disdainful\", \"excited\", \"surprised\", \"satisfied\", \"unhappy\", \"anxious\", \"hysterical\", \"delighted\", \"scared\", \"worried\", \"indifferent\", \"upset\", \"impatient\", \"nervous\", \"guilty\", \"scornful\", \"frustrated\", \"depressed\", \"panicked\", \"furious\", \"empathetic\", \"embarrassed\", \"reluctant\", \"disgusted\", \"keen\", \"moved\", \"proud\", \"relaxed\", \"grateful\", \"confident\", \"interested\", \"curious\", \"confused\", \"joyful\", \"disapproving\", \"negative\", \"denying\", \"astonished\", \"serious\", \"sarcastic\", \"conciliative\", \"comforting\", \"sincere\", \"sneering\", \"hesitating\", \"yielding\", \"painful\", \"awkward\", \"amused\", \"loving\", \"dating\", \"longing\", \"aroused\", \"seductive\", \"ecstatic\", \"shy\"])\n",
    "notes: 特記事項(間の取り方、笑い・ため・ブレス、ノイズ感、キス音、効果音、チュパ音 等)\n",
    "caption: 上記を1〜2文・全角30〜80文字で自然文に要約\n",
    "refined_text: 元の文字起こしテキストに、必要に応じて特殊タグを音声中のイベントの描写として文章のどこかに挿入したもの(必要なければ元テキストをそのまま出力)。\n",
    "\n",
    "元の文字起こしテキスト: {original_transcription}\n",
    "元の音声クリップ:\"\"\"\n",
    "\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"text\", \"text\": prompt},\n",
    "            {\"type\": \"audio\", \"audio\": audio_path},\n",
    "        ]\n",
    "    }\n",
    "]\n",
    "\n",
    "display(Audio(librosa.load(audioread.ffdec.FFmpegAudioFile(audio_path), sr=16000)[0], rate=16000))\n",
    "\n",
    "response, _ = run_model(model=model, messages=messages, processor=processor, return_audio=False, use_audio_in_video=USE_AUDIO_IN_VIDEO)\n",
    "\n",
    "print(response)"
   ]
  },
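  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f0a8d12",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Post-processing sketch (added for illustration): the prompt requests \"key: value\"\n",
    "# lines (profile, mood, ..., refined_text), so the free-form response can usually be\n",
    "# folded into a dict. The model is not guaranteed to follow the format exactly, so\n",
    "# missing or extra lines are simply ignored.\n",
    "EXPECTED_KEYS = [\"profile\", \"mood\", \"speed\", \"prosody\", \"pitch_timbre\",\n",
    "                 \"style\", \"emotion\", \"notes\", \"caption\", \"refined_text\"]\n",
    "\n",
    "def parse_annotation(text):\n",
    "    result = {}\n",
    "    for line in text.splitlines():\n",
    "        line = line.replace(\"：\", \":\", 1)  # tolerate a full-width colon in the reply\n",
    "        key, sep, value = line.partition(\":\")\n",
    "        key = key.strip().lstrip(\"-* \").lower()\n",
    "        if sep and key in EXPECTED_KEYS:\n",
    "            result[key] = value.strip()\n",
    "    return result\n",
    "\n",
    "parse_annotation(response)"
   ]
  }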
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv (3.10.12)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}