Update app.py
app.py
CHANGED
@@ -231,7 +231,7 @@ def get_stopping_criteria(req: GenerateRequest, initial_ids: torch.Tensor, token
     if req.max_length is not None and req.max_length > 0:
         max_len_from_req = req.max_length
         if max_len_from_req <= initial_len:
-
+            pass
     elif req.max_new_tokens is not None and req.max_new_tokens > 0:
         max_len_from_req = initial_len + req.max_new_tokens
         if model_max_len is not None:
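The limit handling here feeds the per-request caps used later in stream_generation_logic (see the effective_max_total_length logic in the next hunk): an explicit max_length takes precedence, otherwise prompt length plus max_new_tokens is used, and the total is clamped to the model's context capacity. A minimal standalone sketch of that resolution; the helper name resolve_max_total_length and the example numbers are illustrative only, not part of app.py:

from typing import Optional

def resolve_max_total_length(initial_len: int,
                             max_length: Optional[int],
                             max_new_tokens: Optional[int],
                             model_capacity: int) -> int:
    # An explicit max_length wins; otherwise prompt length + max_new_tokens;
    # either way the result is clamped to the model's positional capacity.
    if max_length is not None and max_length > 0:
        total = max_length
    elif max_new_tokens is not None and max_new_tokens > 0:
        total = initial_len + max_new_tokens
    else:
        total = model_capacity
    return min(total, model_capacity)

# 32 prompt tokens + 128 new tokens, well under a 2048-token context window:
print(resolve_max_total_length(32, max_length=None, max_new_tokens=128, model_capacity=2048))  # -> 160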
@@ -264,111 +264,141 @@ def get_stopping_criteria(req: GenerateRequest, initial_ids: torch.Tensor, token
         logger.error(f"Failed to create StopSequenceCriteria: {e}", exc_info=True)
         raise HTTPException(status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Failed to create StopSequenceCriteria: {e}")
     return criteria
+def generate_next_token_sync(
+    input_ids: torch.Tensor,
+    past_key_values: Optional[Tuple],
+    gen_cfg: GenerationConfig,
+    device: str
+) -> Tuple[torch.Tensor, Any, torch.Tensor]:
+    model_input_ids = input_ids.to(global_model.device)
+    model_past_key_values = past_key_values
+    with torch.no_grad():
+        outputs = global_model(
+            input_ids=model_input_ids,
+            past_key_values=model_past_key_values,
+            use_cache=gen_cfg.use_cache,
+            return_dict=True
+        )
+    logits = outputs.logits[:, -1, :]
+    past = outputs.past_key_values
+    if gen_cfg.do_sample:
+        if gen_cfg.temperature > 1e-8:
+            logits = logits / gen_cfg.temperature
+        if gen_cfg.top_k and gen_cfg.top_k > 0:
+            topk_values, topk_indices = torch.topk(logits, gen_cfg.top_k)
+            logits[logits < topk_values[:, -1]] = -float('Inf')
+        if gen_cfg.top_p < 1.0 - 1e-8:
+            sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True)
+            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+            sorted_indices_to_remove = cumulative_probs > gen_cfg.top_p
+            sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+            sorted_indices_to_remove[..., 0] = False
+            indices_to_remove = sorted_indices[sorted_indices_to_remove]
+            logits[:, indices_to_remove] = -float('Inf')
+        token = torch.multinomial(torch.softmax(logits, dim=-1), 1)
+    else:
+        token = torch.argmax(logits, dim=-1, keepdim=True)
+    return token.to('cpu'), past, logits.to('cpu')
 async def stream_generation_logic(req: GenerateRequest, initial_ids: torch.Tensor, gen_cfg: GenerationConfig, device: str) -> AsyncGenerator[str, None]:
-
-    full_sequence: List[int] = initial_ids.tolist()[0]
+    past = None
     generated_tokens_count = 0
-
-
-    eos_token_id = gen_cfg.eos_token_id
-    pad_token_id = gen_cfg.pad_token_id
+    eos_token_id = req.eos_token_id_override if req.eos_token_id_override is not None else global_tokens.get("eos_token_id")
+    pad_token_id = req.pad_token_id_override if req.pad_token_id_override is not None else global_tokens.get("pad_token_id")
     stop_token_ids = set()
     if eos_token_id is not None:
         stop_token_ids.add(eos_token_id)
     if pad_token_id is not None and pad_token_id != eos_token_id:
         stop_token_ids.add(pad_token_id)
-
-
-
-
-
-
-
-
-
-
+    current_ids = initial_ids.to(device)
+    initial_len = initial_ids.shape[-1]
+    total_ids_list = initial_ids.tolist()[0]
+    start_time = time.time()
+    finish_reason = "unknown"
+    all_stopping_criteria = get_stopping_criteria(req, initial_ids.to('cpu'), global_tokenizer)
+    stream_stopping_criteria = StoppingCriteriaList([
+        crit for crit in all_stopping_criteria
+        if isinstance(crit, (MaxLengthCriteria, StopSequenceCriteria))
+    ])
+    last_step_logits = None
     model_total_capacity = getattr(global_model.config, 'max_position_embeddings', None)
     if model_total_capacity is None:
         model_total_capacity = MAX_CONTEXT_TOKENS + MAX_GENERATION_TOKENS
     effective_max_total_length = req.max_length if req.max_length is not None else initial_len + req.max_new_tokens
     if effective_max_total_length > model_total_capacity:
         effective_max_total_length = model_total_capacity
-
+    current_segment_start_ids = initial_ids.clone()
+    current_segment_initial_len = initial_len
+    current_segment_generated_count = 0
+    yielded_segments = 0
+    logger.info(f"Starting stream generation: max_new_tokens (soft limit)={req.max_new_tokens}, max_length (effective total)={effective_max_total_length}, max_time={req.max_time}, initial_len={initial_len}")
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            stopping_criteria=stopping_criteria_list if stopping_criteria_list else None,
-        ):
-            current_sequence: List[int] = generation_output.sequences[0].tolist()
-            new_tokens_ids = current_sequence[last_sequence_len:]
-            if not new_tokens_ids:
-                continue
-            text = global_tokenizer.decode(new_tokens_ids, skip_special_tokens=True)
+        while True:
+            if req.max_time is not None and (time.time() - start_time) > req.max_time:
+                finish_reason = "time"
+                logger.info(f"Stopping stream generation: {finish_reason} reached (>{req.max_time} seconds).")
+                break
+            current_total_len = len(total_ids_list)
+            if current_total_len >= effective_max_total_length:
+                finish_reason = "max_length_reached"
+                logger.info(f"Stopping stream generation: {finish_reason} ({current_total_len} tokens).")
+                break
+            input_ids_sync = current_ids if past is None else token.to(device)
+            token, past, step_logits = await asyncio.to_thread(
+                generate_next_token_sync,
+                input_ids_sync,
+                past,
+                gen_cfg,
+                device
+            )
+            last_step_logits = step_logits
+            generated_token_id = token[0].item()
+            total_ids_list.append(generated_token_id)
+            text = global_tokenizer.decode([generated_token_id], skip_special_tokens=True)
             text = filter_unwanted_json_fragments(text)
-            full_sequence.extend(new_tokens_ids)
-            generated_tokens_count += len(new_tokens_ids)
-            last_sequence_len = len(current_sequence)
             chunk_payload: Dict[str, Any] = {
                 "type": "token",
                 "text": text,
-                "
-                "generated_tokens_count": generated_tokens_count,
+                "token_id": generated_token_id,
+                "generated_tokens_count": generated_tokens_count + 1,
+                "segment": yielded_segments,
+                "segment_token_count": current_segment_generated_count + 1,
             }
             yield json.dumps(chunk_payload) + "\n"
-
-
-
-
-                finish_reason
+            generated_tokens_count += 1
+            current_segment_generated_count += 1
+            if generated_token_id in stop_token_ids:
+                finish_reason = "eos_token" if generated_token_id == eos_token_id else "pad_token"
+                logger.info(f"Stopping stream generation: Stop token {generated_token_id} ({finish_reason}) generated.")
                 break
-
-
+            current_full_ids_tensor = torch.tensor([total_ids_list], device='cpu')
+            if stream_stopping_criteria(current_full_ids_tensor, last_step_logits):
+                criteria_finish = "stopping_criteria"
+                if any(isinstance(c, MaxLengthCriteria) for c in stream_stopping_criteria):
+                    max_len_crit_met = any(len(total_ids_list) >= c.max_length_seq for c in stream_stopping_criteria if isinstance(c, MaxLengthCriteria))
+                    if max_len_crit_met:
+                        criteria_finish = "max_length"
+                stop_seq_crit_met = any(isinstance(c, StopSequenceCriteria) for c in stream_stopping_criteria) and req.stop_sequences
+                if stop_seq_crit_met:
+                    generated_text_so_far = global_tokenizer.decode(total_ids_list[initial_len:], skip_special_tokens=True)
+                    generated_text_so_far = filter_unwanted_json_fragments(generated_text_so_far)
+                    if any(seq and seq in generated_text_so_far for seq in req.stop_sequences):
+                        criteria_finish = "stop_sequence"
+                finish_reason = criteria_finish
+                logger.info(f"Stopping stream generation: {finish_reason} criteria met.")
                 break
-
-
-        if generated_tokens_count > 0:
-            last_token = final_sequence_ids[-1]
-            if eos_token_id is not None and last_token == eos_token_id:
-                finish_reason = "eos_token"
-            elif pad_token_id is not None and last_token == pad_token_id:
-                finish_reason = "pad_token"
-            elif req.max_new_tokens is not None and generated_tokens_count >= req.max_new_tokens:
-                finish_reason = "max_new_tokens"
-            elif gen_cfg.max_length is not None and len(final_sequence_ids) >= gen_cfg.max_length:
-                finish_reason = "max_length"
-            elif finish_reason == "unknown":
-                finish_reason = "completed"
-        final_text_raw = global_tokenizer.decode(final_sequence_ids[initial_len:], skip_special_tokens=True)
+            current_ids = token.to(device)
+        final_text_raw = global_tokenizer.decode(total_ids_list[initial_len:], skip_special_tokens=True)
         final_text_raw = filter_unwanted_json_fragments(final_text_raw)
         final_payload: Dict[str, Any] = {
             "type": "done",
             "total_prompt_tokens": initial_len,
             "total_generated_tokens": generated_tokens_count,
-            "total_sequence_tokens": len(
+            "total_sequence_tokens": len(total_ids_list),
             "final_text": final_text_raw,
             "finish_reason": finish_reason,
+            "segment": yielded_segments,
         }
-        logger.info(f"Stream generation finished. Reason: {finish_reason}. Total tokens: {len(
+        logger.info(f"Stream generation finished. Reason: {finish_reason}. Total tokens: {len(total_ids_list)}. Segment: {yielded_segments}")
         yield json.dumps(final_payload) + "\n"
     except Exception as e:
         logger.error("Stream generation error", exc_info=True)
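The nucleus (top-p) step inside generate_next_token_sync is the usual sort-and-cumulative-probability mask over the last-position logits. A minimal standalone sketch of just that filter, assuming batch size 1 as in the streaming loop; top_p_filter and the toy logits below are illustrative, not part of app.py:

import torch

def top_p_filter(logits: torch.Tensor, top_p: float) -> torch.Tensor:
    # logits: [1, vocab_size]. Keep the smallest set of tokens whose cumulative
    # probability exceeds top_p; mask everything else to -inf.
    sorted_logits, sorted_indices = torch.sort(logits, dim=-1, descending=True)
    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
    sorted_indices_to_remove = cumulative_probs > top_p
    # Shift right so the first token that crosses the threshold is still kept.
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = False
    indices_to_remove = sorted_indices[sorted_indices_to_remove]  # flat vocab indices; assumes batch size 1
    filtered = logits.clone()
    filtered[:, indices_to_remove] = -float('Inf')
    return filtered

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])   # toy 4-token vocabulary
filtered = top_p_filter(logits, top_p=0.8)        # keeps only the two most likely tokens
token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
print(filtered, token)

Sampling from the softmax of the filtered logits then draws only from the smallest set of tokens whose cumulative probability exceeds top_p, which is what the do_sample branch above does after temperature and top-k scaling.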
@@ -384,28 +414,42 @@ async def generate_full_response(req: GenerateRequest, initial_ids: torch.Tensor
     accumulated_text = ""
     finish_reason = "unknown"
     total_generated_count = 0
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    segments_data = []
+    current_segment_tokens = []
+    current_segment_text = ""
+    current_segment_generated_count = 0
+    segment_index = 0
+    async for chunk_json in stream_generation_logic(req, initial_ids, gen_cfg, device):
+        try:
+            data = json.loads(chunk_json)
+            if data.get("type") == "token":
+                token_id = data.get("token_id")
+                text = data.get("text", "")
+                if token_id is not None:
+                    accumulated_tokens.append(token_id)
+                    current_segment_tokens.append(token_id)
+                accumulated_text += text
+                current_segment_text += text
+                total_generated_count = data.get("generated_tokens_count", total_generated_count + 1)
+                current_segment_generated_count = data.get("segment_token_count", current_segment_generated_count + 1)
+            elif data.get("type") == "done":
+                finish_reason = data.get("finish_reason", "done")
+                final_segment_text = data.get("final_text", "")
+                final_segment_text = filter_unwanted_json_fragments(final_segment_text)
+                accumulated_text = filter_unwanted_json_fragments(accumulated_text) + final_segment_text
+                current_segment_text = filter_unwanted_json_fragments(current_segment_text) + final_segment_text
+                segments_data.append({
+                    "segment": segment_index,
+                    "text": current_segment_text,
+                    "token_ids": current_segment_tokens,
+                    "generated_tokens_count": current_segment_generated_count,
+                    "finish_reason": finish_reason if finish_reason != "max_new_tokens_segment" else "completed_segment"
+                })
+                break
+            elif data.get("type") == "error":
+                raise RuntimeError(f"Error during streaming generation: {data.get('message', 'Unknown error')}")
+        except json.JSONDecodeError:
+            logger.warning(f"Failed to decode JSON chunk: {chunk_json.strip()}")
     full_sequence_ids_list = initial_ids.tolist()[0] + accumulated_tokens
     final_payload: Dict[str, Any] = {
         "prompt_tokens": initial_ids.shape[-1],
@@ -417,6 +461,7 @@ async def generate_full_response(req: GenerateRequest, initial_ids: torch.Tensor
             "full_sequence_token_ids": full_sequence_ids_list
         }],
         "total_tokens": initial_ids.shape[-1] + total_generated_count,
+        "segments": segments_data if segments_data else None
     }
     logger.info(f"Full response generation finished. Reason: {finish_reason}. Total tokens: {final_payload['total_tokens']}.")
     return final_payload
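generate_full_response now just replays the newline-delimited JSON protocol that stream_generation_logic yields, collecting "token" chunks until a "done" chunk arrives. A minimal sketch of a consumer of that protocol, using only the chunk fields visible in the diff ("type", "text", "token_id", "final_text", "finish_reason"); aggregate_stream and the hand-written demo chunks are illustrative:

import json
from typing import Any, Dict, Iterable

def aggregate_stream(chunks: Iterable[str]) -> Dict[str, Any]:
    # Collect "token" chunks until a "done" chunk arrives, mirroring the
    # aggregation loop in generate_full_response above.
    text_parts = []
    token_ids = []
    finish_reason = "unknown"
    for line in chunks:
        data = json.loads(line)
        if data.get("type") == "token":
            text_parts.append(data.get("text", ""))
            if data.get("token_id") is not None:
                token_ids.append(data["token_id"])
        elif data.get("type") == "done":
            finish_reason = data.get("finish_reason", "done")
            break
    return {"text": "".join(text_parts), "token_ids": token_ids, "finish_reason": finish_reason}

# Two hand-written chunks in the same NDJSON shape the server yields:
demo = [
    json.dumps({"type": "token", "text": "Hi", "token_id": 17, "generated_tokens_count": 1}),
    json.dumps({"type": "done", "final_text": "Hi", "finish_reason": "eos_token"}),
]
print(aggregate_stream(demo))  # {'text': 'Hi', 'token_ids': [17], 'finish_reason': 'eos_token'}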
@@ -447,12 +492,10 @@ async def load_model():
     device = "cpu"
     if torch.cuda.is_available():
         device = "cuda"
-        logger.info(f"Using device: {device}")
     elif torch.backends.mps.is_available():
         device = "mps"
-        logger.info(f"Using device: {device}")
     else:
-
+        device = "cpu"
     current_model_name = MODEL_NAME
     current_trust_remote_code = TRUST_REMOTE_CODE
     try:
@@ -467,28 +510,23 @@ async def load_model():
         global_model = AutoModelForCausalLM.from_pretrained(current_model_name, **model_kwargs)
         if 'device_map' not in model_kwargs or model_kwargs['device_map'] is None:
             global_model.to(device)
-            logger.info(f"Manually moved model to device: {device}")
         else:
             model_device = next(global_model.parameters()).device
         global_model.eval()
         global_tokens["eos_token_id"] = global_tokenizer.eos_token_id
         global_tokens["pad_token_id"] = global_tokenizer.pad_token_id
         global_tokens["bos_token_id"] = global_tokenizer.bos_token_id
-        logger.info(f"Tokenizer IDs: EOS={global_tokens['eos_token_id']}, PAD={global_tokens['pad_token_id']}, BOS={global_tokens['bos_token_id']}")
         if global_model.config.pad_token_id is None and global_tokens["pad_token_id"] is None:
             if global_tokens["eos_token_id"] is not None:
                 global_tokenizer.pad_token_id = global_tokens["eos_token_id"]
                 global_model.config.pad_token_id = global_tokens["eos_token_id"]
                 global_tokens["pad_token_id"] = global_tokens["eos_token_id"]
-                logger.warning(f"Model/Tokenizer pad_token_id not set. Using eos_token_id: {global_tokens['pad_token_id']}")
             else:
-
+                pass
         elif global_model.config.pad_token_id is None and global_tokens["pad_token_id"] is not None:
             global_model.config.pad_token_id = global_tokens["pad_token_id"]
-            logger.info(f"Model config pad_token_id was None, set from tokenizer: {global_tokens['pad_token_id']}")
         elif global_model.config.pad_token_id is not None and global_tokens["pad_token_id"] is None:
             global_tokens["pad_token_id"] = global_model.config.pad_token_id
-            logger.warning(f"Tokenizer pad_token_id was None, set from model config: {global_tokens['pad_token_id']}")
         logger.info("Model and tokenizer loaded successfully.")
         logger.info(f"Model device: {next(global_model.parameters()).device}")
     except Exception as e:
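The load_model changes above drop the per-branch log lines but keep the device precedence (CUDA, then Apple MPS, then CPU) and the pad-token fallback onto the EOS id. A minimal sketch of those two pieces in isolation, assuming transformers is installed; pick_device, ensure_pad_token, and the tiny example checkpoint are illustrative names, not part of app.py:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def pick_device() -> str:
    # Same precedence as load_model(): CUDA first, then Apple MPS, else CPU.
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

def ensure_pad_token(tokenizer, model) -> None:
    # Mirrors the fallback above: if neither the tokenizer nor the model config
    # defines a pad token, reuse the EOS id; otherwise copy whichever side is set.
    if tokenizer.pad_token_id is None and model.config.pad_token_id is None:
        if tokenizer.eos_token_id is not None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
            model.config.pad_token_id = tokenizer.eos_token_id
    elif model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id
    elif tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = model.config.pad_token_id

if __name__ == "__main__":
    name = "sshleifer/tiny-gpt2"  # any causal LM checkpoint works; this one is just small
    tokenizer = AutoTokenizer.from_pretrained(name)
    model = AutoModelForCausalLM.from_pretrained(name).to(pick_device())
    model.eval()
    ensure_pad_token(tokenizer, model)
    print(pick_device(), tokenizer.pad_token_id, model.config.pad_token_id)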