08/01/2026 patch

2026-01-08 21:32:23 +01:00
parent 10f38eebcf
commit b426fd87d2
6 changed files with 614 additions and 224 deletions
--- a/gwen.py
+++ b/gwen.py
@@ -1,8 +1,6 @@
 from sys import stderr as err
 from sys import argv

-## Streaming generation on GPU (CUDA) with TextIteratorStreamer
-
 from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
@@ -12,7 +10,6 @@ import threading
 import torch

 model_name = "Qwen/Qwen3-8B-FP8" 
-# model_name = "Qwen/Qwen3-8B"

 # 1) Choose device (use CUDA if available)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -26,7 +23,6 @@ model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    dtype=torch.float16 if device.type == "cuda" else None, 
    device_map=device)
-model.to(device)

 print("tokenizer.model_max_length =", tokenizer.model_max_length, file=err)
 print("model.config.max_position_embeddings =", model.config.max_position_embeddings, file=err)
@@ -45,6 +41,10 @@ inputs = tokenizer.apply_chat_template(
    return_tensors="pt",
 )

+num_input_tokens = inputs["input_ids"].shape[1] 
+tokens = num_input_tokens
+print("input tokens =", num_input_tokens, file=err)
+
 # Move input tensors to the same device as the model
 inputs = {k: v.to(device) for k, v in inputs.items()}

@@ -62,11 +62,11 @@ thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)
 thread.start()

 # 6) Consume and display streamed text in real time
-generated_text = ""
 for chunk in streamer:
-    generated_text += chunk
+    tokens += len(tokenizer.encode(chunk, add_special_tokens=False))
    print(chunk, end="", flush=True)
+    print(tokens, "/131072 of token limit", end="\r", sep="", file=err)
+print()

 thread.join()
 print()  # final newline
-