Update README.md

README.md CHANGED
@@ -236,15 +236,13 @@ transcription.
 >>> processor = WhisperProcessor.from_pretrained("openai/whisper-large")
 >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
 
->>> decoder_input_ids = processor.tokenizer.encode("<|startoftranscript|><|fr|><|transcribe|><|notimestamps|>", return_tensors="pt")
-
 >>> # load dummy dataset and read soundfiles
 >>> ds = load_dataset("common_voice", "fr", split="test", streaming=True)
 >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
 >>> input_speech = next(iter(ds))["audio"]["array"]
 >>> # tokenize
 >>> input_features = processor(input_speech, return_tensors="pt").input_features
->>> predicted_ids = model.generate(input_features, decoder_input_ids=decoder_input_ids)
+>>> predicted_ids = model.generate(input_features)
 >>> transcription = processor.batch_decode(predicted_ids)
 ['<|startoftranscript|><|fr|><|transcribe|><|notimestamps|> Un vrai travail intéressant va enfin être mené sur ce sujet.<|endoftext|>']
 
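With this change, the transcription example no longer builds `decoder_input_ids` by hand with `tokenizer.encode`; calling `generate` with only the input features lets Whisper fall back to its default prompt. A minimal self-contained version of the updated snippet might look like the sketch below; the import lines are an assumption, since the diff shows only the doctest body.

```python
# Minimal sketch of the updated transcription example (imports assumed,
# since the README diff shows only the doctest body).
from datasets import Audio, load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")

# Stream one French Common Voice sample, resampled to Whisper's 16 kHz.
ds = load_dataset("common_voice", "fr", split="test", streaming=True)
ds = ds.cast_column("audio", Audio(sampling_rate=16_000))
input_speech = next(iter(ds))["audio"]["array"]

# Convert the waveform to log-Mel features; generate without a decoder
# prompt so the model uses its default transcription behaviour.
input_features = processor(input_speech, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
print(processor.batch_decode(predicted_ids))
```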
@@ -266,15 +264,15 @@ The "<|translate|>" is used as the first decoder input token to specify the translation task.
 >>> processor = WhisperProcessor.from_pretrained("openai/whisper-large")
 >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
 
->>> decoder_input_ids = processor.tokenizer.encode("<|startoftranscript|><|fr|><|translate|><|notimestamps|>", return_tensors="pt")
-
 >>> # load dummy dataset and read soundfiles
 >>> ds = load_dataset("common_voice", "fr", split="test", streaming=True)
 >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
 >>> input_speech = next(iter(ds))["audio"]["array"]
 >>> # tokenize
 >>> input_features = processor(input_speech, return_tensors="pt").input_features
->>> predicted_ids = model.generate(input_features, decoder_input_ids=decoder_input_ids)
+>>> forced_decoder_ids = processor.get_decoder_prompt_ids(language="fr", task="translate")
+
+>>> predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
 >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
 [' A real interesting work will be done on this subject.']
 ```
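In the updated translation example, `get_decoder_prompt_ids` returns a list of (position, token_id) pairs that `generate` forces at the start of decoding, reproducing the `<|fr|><|translate|><|notimestamps|>` prompt the removed lines encoded by hand. A minimal sketch to inspect that correspondence, assuming the same processor as above:

```python
# Sketch: check that get_decoder_prompt_ids reproduces the hand-built prompt.
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large")

# A list of (position, token_id) pairs; position 0 is left to the
# <|startoftranscript|> token that generate() emits on its own.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="fr", task="translate")

# Decoding the forced ids should give back the special tokens the old
# example passed to tokenizer.encode.
print(processor.tokenizer.convert_ids_to_tokens([tid for _, tid in forced_decoder_ids]))
```

Passing these ids to `generate` pins the language and task; omitting them, as in the transcription hunk, leaves the choice to the model.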