tini-lad

Running on Zero

App Files Files Community

Ruurd committed on Jun 5, 2025

Commit

63d4168

1 Parent(s): ec83427

Simplified interface

Browse files

Files changed (3) hide show

app.py +27 -40
infer.py +0 -90
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -16,7 +16,6 @@ from infer import (
     find_answer_start,
     get_noising_schedule,
     noisify_answer,
-    generate_diffusion_text,
     filter_logits,
     confidence_guided_noising,
     noisify_answer_without_remasking
@@ -39,17 +38,17 @@ rng = np.random.default_rng()
 def generate_diffusion_text(input_ids, top_p, top_k):
     with torch.no_grad():
         input_tensor = torch.tensor([input_ids], dtype=torch.long).to(model.device)
-        with torch.amp.autocast('cuda', dtype=torch.float16):
             logits = model(input_ids=input_tensor)["logits"]
-        logits = filter_logits(logits, top_k=top_p, top_p=top_k)
         logits = logits.clamp(min=-1e8, max=1e4)
         probs = torch.nn.functional.softmax(logits, dim=-1)[0]
         probs = torch.clamp(probs, min=1e-8, max=1.0)
-        assert torch.all(torch.isfinite(probs)), "Non-finite values in probs!"
-        assert (probs >= 0).all(), "Negative probs!"
         sampled = torch.multinomial(probs, num_samples=1).squeeze(-1).tolist()
-        # Extract confidence of selected tokens
         conf = probs[range(len(sampled)), sampled].cpu().numpy()
     return sampled, conf
@@ -79,10 +78,14 @@ def highlight_tokens(token_ids, answer_start, changed_indices, color):
             highlighted.append(tok_str)
     return "".join(highlighted)
-def diffusion_chat(question, max_it, pause_length, sharpness,
-                   clustering, noise_start, use_confidence_noising,
-                   use_permanent_unmasking, noise_clipping, top_p,
-                   top_k):
     if question.strip() == "":
         question = "What do you know about the city of Amsterdam?"
@@ -111,6 +114,7 @@ def diffusion_chat(question, max_it, pause_length, sharpness,
     unmasked_mask = [False] * len(current_tokens)
     for i in range(max_it):
         generated_tokens, confidences = generate_diffusion_text(current_tokens, top_p, top_k)
         current_tokens = ori_input_tokens[:answer_start] + generated_tokens[answer_start:]
@@ -133,25 +137,15 @@ def diffusion_chat(question, max_it, pause_length, sharpness,
         if len(last_tokens) == 3 and last_tokens[0] == last_tokens[1] == last_tokens[2]:
             yield render_html("Stopped early", f"After {i+1} iterations.")
             break
         # NOISING
-        if i < max_it-1:
             threshold = get_noising_schedule(i, max_it, sharpness=sharpness)
-            if use_confidence_noising:
-                noised_answer, just_noised_indices = confidence_guided_noising(
-                    current_tokens, answer_start, tokenizer, confidences, noise_clipping,
-                    threshold=threshold, noise_start=noise_start
-                )
-            elif use_permanent_unmasking:
-                noised_answer, just_noised_indices = noisify_answer_without_remasking(
-                    current_tokens, answer_start, tokenizer, threshold=threshold,
-                    noise_start=noise_start, unmasked_mask=unmasked_mask
-                )
-            else:
-                noised_answer, just_noised_indices = noisify_answer(
-                    current_tokens, answer_start, tokenizer,
-                    threshold=threshold, clustering=clustering, noise_start=noise_start
-                )
             for idx in range(answer_start, len(current_tokens)):
                 if noised_answer[idx] != mask_token_id:
@@ -172,7 +166,7 @@ def diffusion_chat(question, max_it, pause_length, sharpness,
         final_ids = answer_ids
     final_output = tokenizer.decode(final_ids, skip_special_tokens=True)
-    yield render_html(f"Final Output ({len(final_ids)} tokens after {i+1} iterations)", final_output)
 def is_running_on_spaces():
@@ -197,22 +191,15 @@ print("✅ Model loaded.")
 vocab_size = len(tokenizer)
 eos_token_id = tokenizer.eos_token_id
 mask_token_id = tokenizer.encode('MASK', add_special_tokens=False)[0]
-assistant_marker_ids = tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>", add_special_tokens=False)
 demo = gr.Interface(
     fn=diffusion_chat,
     inputs=[
         gr.Textbox(label="User Question", lines=2, placeholder="What do you know about the city of Amsterdam?"),
-        gr.Slider(1, 512, value=64, step=1, label="Number of iterarions: ↑ = more iterations"),
-        gr.Slider(0.01, 5, value=0.01, step=0.01, label="Pause between iteration ↑ = longer pause"),
-        gr.Slider(1.0, 20.0, value=1.0, step=0.5, label="Noise decay sharpness: ↓ = more noise in later iterations"),
-        gr.Slider(0.0, 1.0, value=0.0, step=0.05, label="Clustering: ↑ = more clustered noising"),
-        gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Noise start fraction: ↑ = more noise"),
-        gr.Checkbox(value=False, label="Use confidence-guided noising"),
-        gr.Checkbox(value=False, label="Use permanent unmasking"),
-        gr.Slider(0.01, 1.0, value=0.01, step=0.01, label="Noise clipping: ↓ = more confidence guidance"),
-        gr.Slider(1, 1000, value = 3, step = 1, label = "Top-p: ↑ = more random answers"),
-        gr.Slider(0.0, 1.0, value = 1.0, step = 0.01, label = "Top-k: ↑ = more random answers")
     ],
     outputs=[gr.HTML(label="Diffusion Output")],
     title="Diffusion Language Model Chat",

     find_answer_start,
     get_noising_schedule,
     noisify_answer,
     filter_logits,
     confidence_guided_noising,
     noisify_answer_without_remasking
 def generate_diffusion_text(input_ids, top_p, top_k):
     with torch.no_grad():
         input_tensor = torch.tensor([input_ids], dtype=torch.long).to(model.device)
+        with torch.cuda.amp.autocast(dtype=torch.float16):
             logits = model(input_ids=input_tensor)["logits"]
+        logits = filter_logits(logits, top_k=top_k, top_p=top_p)
         logits = logits.clamp(min=-1e8, max=1e4)
         probs = torch.nn.functional.softmax(logits, dim=-1)[0]
         probs = torch.clamp(probs, min=1e-8, max=1.0)
+        # assert torch.all(torch.isfinite(probs)), "Non-finite values in probs!"
+        # assert (probs >= 0).all(), "Negative probs!"
         sampled = torch.multinomial(probs, num_samples=1).squeeze(-1).tolist()
         conf = probs[range(len(sampled)), sampled].cpu().numpy()
     return sampled, conf
             highlighted.append(tok_str)
     return "".join(highlighted)
+def diffusion_chat(question, noising, max_it, pause_length):
+    pause_length = 0
+    sharpness = 3.0
+    noise_start = 0.5
+    top_p = 1.0
+    top_k = 10
+    clustering = False
     if question.strip() == "":
         question = "What do you know about the city of Amsterdam?"
     unmasked_mask = [False] * len(current_tokens)
     for i in range(max_it):
         generated_tokens, confidences = generate_diffusion_text(current_tokens, top_p, top_k)
         current_tokens = ori_input_tokens[:answer_start] + generated_tokens[answer_start:]
         if len(last_tokens) == 3 and last_tokens[0] == last_tokens[1] == last_tokens[2]:
             yield render_html("Stopped early", f"After {i+1} iterations.")
             break
         # NOISING
+        if i < max_it-1 and noising:
             threshold = get_noising_schedule(i, max_it, sharpness=sharpness)
+            noised_answer, just_noised_indices = noisify_answer(
+                current_tokens, answer_start, tokenizer,
+                threshold=threshold, clustering=clustering, noise_start=noise_start
+            )
             for idx in range(answer_start, len(current_tokens)):
                 if noised_answer[idx] != mask_token_id:
         final_ids = answer_ids
     final_output = tokenizer.decode(final_ids, skip_special_tokens=True)
+    yield render_html(f"Final Output ({len(final_ids)} tokens after {i+1} iterations)", final_output) # type: ignore
 def is_running_on_spaces():
 vocab_size = len(tokenizer)
 eos_token_id = tokenizer.eos_token_id
 mask_token_id = tokenizer.encode('MASK', add_special_tokens=False)[0]
+assistant_marker_ids = tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>\n", add_special_tokens=False)
 demo = gr.Interface(
     fn=diffusion_chat,
     inputs=[
         gr.Textbox(label="User Question", lines=2, placeholder="What do you know about the city of Amsterdam?"),
+        gr.Checkbox(label="Enable noising", value=True, info="If disabled, the model will not apply any intermediate noise."),
+        gr.Slider(1, 512, value=64, step=1, label="Increase the maximum number of iterations to run."),
+        gr.Slider(0, 5, value=0, step=0.01, label="Increase the pause between iterations to visualize the process.")
     ],
     outputs=[gr.HTML(label="Diffusion Output")],
     title="Diffusion Language Model Chat",

infer.py CHANGED Viewed

@@ -190,26 +190,6 @@ def confidence_guided_noising(input_ids, answer_start, tokenizer, confidences, n
     noised_indices = sorted(noised_indices)
     return noised, noised_indices
-def generate_diffusion_text(model, input_ids, answer_start, top_k=0, top_p=1.0, temperature=1.0,
-                            eos_token_id=None, eos_boost=0.0):
-    model.eval()
-    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.bfloat16):
-        input_tensor = torch.tensor([input_ids], dtype=torch.long).to(model.device)
-        logits = model(input_ids=input_tensor)["logits"]  # (1, seq_len, vocab_size)
-        # Optionally boost or suppress EOS token
-        if eos_token_id is not None and eos_boost != 0.0:
-            logits[:, :, eos_token_id] += eos_boost
-        # Filter and sample
-        filtered_logits = filter_logits(logits, top_k=top_k, top_p=top_p, temperature=temperature)
-        probs = F.softmax(filtered_logits, dim=-1).squeeze()  # (seq_len, vocab_size)
-        probs = torch.clamp(probs, min=1e-8, max=1.0)
-        sampled = torch.multinomial(probs, num_samples=1).squeeze(-1)
-        confidences = probs.gather(1, sampled.unsqueeze(-1)).squeeze(-1)
-    return input_ids[:answer_start] + sampled[answer_start:].tolist(), confidences
 def calculate_answer_perplexity(prompt, answer, model_name='gpt2-large'):
     from transformers import AutoTokenizer, AutoModelForCausalLM
@@ -277,73 +257,3 @@ def save_html_colored_output(filename, html_content):
         </body>
         </html>
         """)
-def generate_answer(question: str, model, tokenizer, max_it=16, noise_start=0.5,
-                    noising_sharpness=5.0, max_length=256, top_k=100, top_p=1.0,
-                    temperature=1.0, eos_token_id = None, eos_boost = 0.0) -> str:
-    if eos_token_id is None:
-        eos_token_id = tokenizer.eos_token_id
-    # Format prompt with LLaMA 3 chat template
-    prompt = (
-        "<|begin_of_text|>\n"
-        "<|start_header_id|>system<|end_header_id|>\n"
-        "You are a helpful assistant.\n"
-        "<|eot_id|>\n"
-        "<|start_header_id|>user<|end_header_id|>\n"
-        f"{question.strip()}\n"
-        "<|start_header_id|>assistant<|end_header_id|>\n"
-    )
-    input_ids = tokenizer.encode(prompt, add_special_tokens=False)
-    marker = tokenizer.encode("<|start_header_id|>assistant<|end_header_id|>\n", add_special_tokens=False)
-    def find_answer_start(ids, marker):
-        for i in range(len(ids) - len(marker) + 1):
-            if ids[i:i+len(marker)] == marker:
-                return i + len(marker)
-        return None
-    answer_start = find_answer_start(input_ids, marker)
-    if answer_start is None:
-        raise ValueError("Assistant marker not found in prompt.")
-    # Pad to max length
-    pad_token = tokenizer.eos_token_id
-    mask_token = tokenizer.encode("MASK", add_special_tokens=False)[0]
-    input_ids = input_ids[:max_length]
-    if len(input_ids) < max_length:
-        input_ids += [mask_token] * (max_length - len(input_ids))
-    ori_tokens = input_ids
-    current_tokens = noisify_answer(ori_tokens, answer_start, threshold=1.0, mask_token_id=mask_token)
-    last_tokens = []
-    for step in range(max_it):
-        # Generate a new prediction
-        current_tokens, confidence_scores = generate_diffusion_text(
-            model, current_tokens, answer_start,
-            top_k=top_k, top_p=top_p, temperature=temperature,
-            eos_token_id=eos_token_id, eos_boost=eos_boost
-        )
-        # Display for debugging / tracking
-        display_diffusion_output(
-            step, max_it, question,
-            ori_tokens, current_tokens, confidence_scores,
-            answer_start, tokenizer
-        )
-        # Early stopping
-        last_tokens.append(current_tokens)
-        if len(last_tokens) > 4:
-            last_tokens.pop(0)
-            if all(t == last_tokens[0] for t in last_tokens):
-                break
-        # Re-apply noise for next iteration
-        if step < max_it - 1:
-            threshold = noise_start * get_noising_schedule(step, max_it, sharpness=noising_sharpness)
-            current_tokens = noisify_answer(current_tokens, answer_start, threshold=threshold, mask_token_id=mask_token)
-    return tokenizer.decode(current_tokens[answer_start:], skip_special_tokens=True).strip()

     noised_indices = sorted(noised_indices)
     return noised, noised_indices
 def calculate_answer_perplexity(prompt, answer, model_name='gpt2-large'):
     from transformers import AutoTokenizer, AutoModelForCausalLM
         </body>
         </html>
         """)

requirements.txt CHANGED Viewed

@@ -7,3 +7,4 @@ gradio>=4.10.0
 numpy
 load_dotenv
 ipython

 numpy
 load_dotenv
 ipython
+spaces