From 750f067b7527cc4a95e220cf80bdf8cfbbed8413 Mon Sep 17 00:00:00 2001 From: tukaunu Date: Fri, 9 Jan 2026 04:48:16 +0100 Subject: [PATCH] moved finished notes to own folder; started work on fine tuning --- dataset/newsdownloader.py | 76 ++++++++ gwen.py | 19 +- newsdownloader.py | 42 ----- 08jan2026.txt => patchnotes/08jan2026.txt | 0 25dec2025.txt => patchnotes/25dec2025.txt | 0 train.ipynb | 212 ++++++++++++++++++++++ 6 files changed, 297 insertions(+), 52 deletions(-) create mode 100644 dataset/newsdownloader.py delete mode 100644 newsdownloader.py rename 08jan2026.txt => patchnotes/08jan2026.txt (100%) rename 25dec2025.txt => patchnotes/25dec2025.txt (100%) create mode 100644 train.ipynb diff --git a/dataset/newsdownloader.py b/dataset/newsdownloader.py new file mode 100644 index 0000000..a04301c --- /dev/null +++ b/dataset/newsdownloader.py @@ -0,0 +1,76 @@ +# the dataset (and probably the model) is excluded from the repo +# this is mainly to protect the work of the contributors +# who painstakingly wrote all the previous patch notes by hand +# this obviously doesn't make it impossible to extract the data yourself but +# hopefully it makes it hard enough for most people to not bother + +import requests +from bs4 import BeautifulSoup + +URL = "https://azurlane.yo-star.com/news/" +MONTHS = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December" +] + +def get_text(date): + response = requests.get(URL + date) + bs = BeautifulSoup(response.text, features="html.parser") + wp = bs.find(id="main") + text = wp.get_text(separator="\n").splitlines() + if any("Oops! That" in line for line in text): + return text + while text[0] != "List of New Contents": + text.pop(0) + while text[-1] != "Friendly Reminders": + text.pop() + text.pop() + + return text + +# ans.txt is the wikitext of all the recent news +# most crucially including dates +lines = open("ans.txt") + +current = [] +for line in map(str.strip, lines): + if line.startswith("==") and any(month in line for month in MONTHS) and len(current): + out = "\n".join(current) + current = [] + + m, d, y = line.replace("=", "").strip().split() + m = MONTHS.index(m) + 1 + while not d[-1].isnumeric(): d = d[:-1] + d = int(d) - 1 + + date = "/".join(map(str, (y, m, d))) + resp = get_text(date) + if "Oops!" in resp: + print("skipping", date) + continue + prompt = "\n".join(resp) + + for a, b in ( + ("–", "-"), + ("’", "'") + ): + prompt = prompt.replace(a, b) + + with open("pairs.py", "a", encoding="utf8") as f: + f.write("{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n") + + print(date, end="\r") + + if line.strip(): + current.append(line) + diff --git a/gwen.py b/gwen.py index 6494834..d80ddce 100644 --- a/gwen.py +++ b/gwen.py @@ -1,15 +1,13 @@ -from sys import stderr as err -from sys import argv - from transformers import ( AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, ) +from sys import argv, stderr as err import threading import torch -model_name = "Qwen/Qwen3-8B-FP8" +model_name = "Qwen/Qwen3-8B-FP8" # 1) Choose device (use CUDA if available) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -20,12 +18,13 @@ tokenizer = AutoTokenizer.from_pretrained(model_name) # If GPU and limited VRAM, consider dtype=torch.float16 for half precision model = AutoModelForCausalLM.from_pretrained( - model_name, - dtype=torch.float16 if device.type == "cuda" else None, - device_map=device) + model_name, + dtype=torch.float16 if device.type == "cuda" else None, + device_map=device, +) -print("tokenizer.model_max_length =", tokenizer.model_max_length, file=err) -print("model.config.max_position_embeddings =", model.config.max_position_embeddings, file=err) +print("max_length =", tokenizer.model_max_length, file=err) +print("max_embeds =", model.config.max_position_embeddings, file=err) # 3) Prepare chat inputs (tokenized tensors) if len(argv) > 1: @@ -41,7 +40,7 @@ inputs = tokenizer.apply_chat_template( return_tensors="pt", ) -num_input_tokens = inputs["input_ids"].shape[1] +num_input_tokens = inputs["input_ids"].shape[1] tokens = num_input_tokens print("input tokens =", num_input_tokens, file=err) diff --git a/newsdownloader.py b/newsdownloader.py deleted file mode 100644 index 07e70fc..0000000 --- a/newsdownloader.py +++ /dev/null @@ -1,42 +0,0 @@ -# WIP thingy that downloads official patch notes, starting from 24/12/2025 going one week back at a time -# WIP because patch notes don't always come out on thursdays, which this script just assumes -# the "idea" is to download a ton of patch notes and get the corresponding recent news articles from the wiki -# and train or fine tune some kind of model to do patch notes better - -from datetime import date, timedelta -import requests -from bs4 import BeautifulSoup - -d = date(2025, 12, 24) -URL = "https://azurlane.yo-star.com/news/" - -search = None - -while True: - print(search) - if d == search: - search = d - timedelta(days=7) - d -= timedelta(days=14) - continue - date = f"{d.year}/{d.month}/{d.day}" - print(URL + date) - - response = requests.get(URL + date) - bs = BeautifulSoup(response.text, features="html.parser") - if "Oops!" in bs.text: - if search == None: - search = d - d += timedelta(days=1) - continue - wp = bs.find(id="main") - text = wp.get_text(separator="\n").splitlines() - while text[0] != "List of New Contents": - text.pop(0) - while text[-1] != "Friendly Reminders": - text.pop() - text.pop() - - with open(f"{d.year}-{d.month}-{d.day}.txt", "w", encoding="utf8") as f: - f.write("\n".join(text)) - - d -= timedelta(days=7) diff --git a/08jan2026.txt b/patchnotes/08jan2026.txt similarity index 100% rename from 08jan2026.txt rename to patchnotes/08jan2026.txt diff --git a/25dec2025.txt b/patchnotes/25dec2025.txt similarity index 100% rename from 25dec2025.txt rename to patchnotes/25dec2025.txt diff --git a/train.ipynb b/train.ipynb new file mode 100644 index 0000000..29b58ff --- /dev/null +++ b/train.ipynb @@ -0,0 +1,212 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "01aabcdb", + "metadata": {}, + "source": [ + "import stuff" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "85710d55", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import (\n", + " AutoTokenizer,\n", + " AutoModelForCausalLM,\n", + " TextIteratorStreamer,\n", + ")\n", + "from sys import stderr as err\n", + "import threading\n", + "import torch" + ] + }, + { + "cell_type": "markdown", + "id": "827268e2", + "metadata": {}, + "source": [ + "load model and set max tokens to 131072 (using rope yarn thingy whatever)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f6453597", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using device: cuda\n", + "Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00, 1.95s/it]\n", + "max_length = 131072\n", + "max_embeds = 131072\n" + ] + } + ], + "source": [ + "model_name = \"Qwen/Qwen3-8B-FP8\"\n", + "\n", + "# 1) Choose device (use CUDA if available)\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "print(\"Using device:\", device, file=err)\n", + "\n", + "# 2) Load tokenizer and model\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "\n", + "# If GPU and limited VRAM, consider dtype=torch.float16 for half precision\n", + "model = AutoModelForCausalLM.from_pretrained(\n", + " model_name,\n", + " dtype=torch.float16 if device.type == \"cuda\" else None,\n", + " device_map=device,\n", + ")\n", + "\n", + "print(\"max_length =\", tokenizer.model_max_length, file=err)\n", + "print(\"max_embeds =\", model.config.max_position_embeddings, file=err)" + ] + }, + { + "cell_type": "markdown", + "id": "b1699f5e", + "metadata": {}, + "source": [ + "prep prompt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d78c3dd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "input tokens = 11541\n" + ] + } + ], + "source": [ + "# 3) Prepare chat inputs (tokenized tensors)\n", + "prompt = open(\"prompt\").read().strip()\n", + "messages = [{\"role\": \"user\", \"content\": prompt}]\n", + "inputs = tokenizer.apply_chat_template(\n", + " messages,\n", + " add_generation_prompt=True,\n", + " tokenize=True,\n", + " return_dict=True,\n", + " return_tensors=\"pt\",\n", + ")\n", + "\n", + "num_input_tokens = inputs[\"input_ids\"].shape[1]\n", + "tokens = num_input_tokens\n", + "print(\"input tokens =\", num_input_tokens, file=err)\n", + "\n", + "# Move input tensors to the same device as the model\n", + "inputs = {k: v.to(device) for k, v in inputs.items()}\n", + "\n", + "# 4) Create streamer\n", + "streamer = TextIteratorStreamer(\n", + " tokenizer, \n", + " skip_prompt=True, \n", + " skip_special_tokens=True\n", + ")\n", + "\n", + "# 5) Start generation in background thread (generate is blocking)\n", + "gen_kwargs = dict(\n", + " **inputs,\n", + " max_new_tokens=131072,\n", + " streamer=streamer,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "bc6bf4f8", + "metadata": {}, + "source": [ + "do inference" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ad2f8968", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Okay, let me try to figure out how to transform the given news into the MediaWiki format based on the examples provided. First, I need to understand the structure of the examples to replicate it accurately.\n", + "\n", + "Looking at the first example, the news is structured with a date header, then under \"New Contents\" there are several list items, each starting with a bold title. Each list item has bullet points with specific details. For instance, \"New Chapter\" has subpoints about the chapter availability, obtainable ships, enemy levels, and level caps. Then there's a section for \"System Optimization\" with numbered points.\n", + "\n", + "The second example has more sections, like \"Limited Time Event\" and \"New [Skins]\" with different sub-sections. The third example includes \"New Contents\" with various subcategories like \"New Chapter\", \"New gameplay added\", \"New Character\", \"Augment Update\", \"New Memory\", \"FleetChat Update\", \"CV Update\", and \"System Optimization\". Each of these has specific formatting, such as using ShipDisplay templates with parameters, and sometimes tables for skins or furniture.\n", + "\n", + "Now, the news I need to convert is from January 8, 2026. Let's parse the content step by step.\n", + "\n", + "Starting with the date: \"Posted on January 8, 2026\" becomes \"==January " + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 2\u001b[39m thread.start()\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# 6) Consume and display streamed text in real time\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mtokens\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mend\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflush\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/inference/.venv/lib/python3.12/site-packages/transformers/generation/streamers.py:226\u001b[39m, in \u001b[36mTextIteratorStreamer.__next__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 225\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m226\u001b[39m value = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtext_queue\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 227\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m value == \u001b[38;5;28mself\u001b[39m.stop_signal:\n\u001b[32m 228\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m()\n", + "\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/queue.py:171\u001b[39m, in \u001b[36mQueue.get\u001b[39m\u001b[34m(self, block, timeout)\u001b[39m\n\u001b[32m 169\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 170\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._qsize():\n\u001b[32m--> \u001b[39m\u001b[32m171\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnot_empty\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 172\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m timeout < \u001b[32m0\u001b[39m:\n\u001b[32m 173\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33m'\u001b[39m\u001b[33mtimeout\u001b[39m\u001b[33m'\u001b[39m\u001b[33m must be a non-negative number\u001b[39m\u001b[33m\"\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/threading.py:355\u001b[39m, in \u001b[36mCondition.wait\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 353\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[32m 354\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m355\u001b[39m \u001b[43mwaiter\u001b[49m\u001b[43m.\u001b[49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 356\u001b[39m gotit = \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[32m 357\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: " + ] + } + ], + "source": [ + "thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)\n", + "thread.start()\n", + "\n", + "# 6) Consume and display streamed text in real time\n", + "for chunk in streamer:\n", + " tokens += len(tokenizer.encode(chunk, add_special_tokens=False))\n", + " print(chunk, end=\"\", flush=True)\n", + " # print(tokens, \"/131072 of token limit\", end=\"\\r\", sep=\"\", file=err)\n", + "print()\n", + "\n", + "thread.join()\n", + "print() # final newline" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}