moved finished notes to own folder; started work on fine tuning

This commit is contained in:
2026-01-09 04:48:16 +01:00
parent b426fd87d2
commit 750f067b75
6 changed files with 297 additions and 52 deletions

76
dataset/newsdownloader.py Normal file
View File

@@ -0,0 +1,76 @@
# the dataset (and probably the model) is excluded from the repo
# this is mainly to protect the work of the contributors
# who painstakingly wrote all the previous patch notes by hand
# this obviously doesn't make it impossible to extract the data yourself but
# hopefully it makes it hard enough for most people to not bother
import requests
from bs4 import BeautifulSoup
URL = "https://azurlane.yo-star.com/news/"
# Calendar month names; index + 1 is the month number.  Used both to
# recognise wiki date headers and to convert the name to a numeric month.
MONTHS = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]
def get_text(date):
    """Fetch the official news article for *date* (``"YYYY/M/D"``) and
    return its lines.

    Returns the page text trimmed to the span between the
    ``"List of New Contents"`` and ``"Friendly Reminders"`` markers
    (both markers excluded from the tail).  If the article does not
    exist, yo-star serves an "Oops! That page can't be found" page;
    in that case the *untrimmed* lines are returned so the caller can
    detect the error text and skip the date.
    """
    # timeout so a stalled connection cannot hang the scrape forever
    response = requests.get(URL + date, timeout=30)
    bs = BeautifulSoup(response.text, features="html.parser")
    wp = bs.find(id="main")
    text = wp.get_text(separator="\n").splitlines()
    if any("Oops! That" in line for line in text):
        return text
    # Drop the site boilerplate before the patch-note body...
    while text[0] != "List of New Contents":
        text.pop(0)
    # ...and everything after it, including the marker line itself.
    while text[-1] != "Friendly Reminders":
        text.pop()
    text.pop()
    return text
# ans.txt is the wikitext of all the recent news
# most crucially including dates
#
# Each "== Month Day, Year ==" header flushes the wikitext buffered so far
# and pairs it with the official news article fetched for that header's
# date.  NOTE(review): text after the *final* header is never flushed —
# confirm that losing the last section is acceptable.
with open("ans.txt", encoding="utf8") as lines:
    current = []  # wikitext lines accumulated since the previous flush
    for line in map(str.strip, lines):
        # A month header marks a section boundary (skip when nothing is buffered).
        if line.startswith("==") and any(month in line for month in MONTHS) and current:
            out = "\n".join(current)
            current = []
            # Header looks like "== January 8, 2026 ==" -> month day, year.
            m, d, y = line.replace("=", "").strip().split()
            m = MONTHS.index(m) + 1
            # Strip trailing punctuation from the day token ("8," -> "8").
            while not d[-1].isnumeric():
                d = d[:-1]
            # presumably the news URL uses the day before the wiki date — TODO confirm
            d = int(d) - 1
            date = "/".join(map(str, (y, m, d)))
            resp = get_text(date)
            # get_text returns the error page untrimmed when the article is
            # missing; match the marker as a substring of each line — the old
            # `"Oops!" in resp` was a list-membership test that required a
            # line to be exactly "Oops!" and therefore never triggered.
            if any("Oops!" in ln for ln in resp):
                print("skipping", date)
                continue
            prompt = "\n".join(resp)
            # Normalise typographic characters to ASCII equivalents.
            # NOTE(review): the originals were garbled in transit — presumed
            # to be an en dash and a curly apostrophe; confirm.
            for a, b in (
                ("\u2013", "-"),   # en dash
                ("\u2019", "'"),   # right single quotation mark
            ):
                prompt = prompt.replace(a, b)
            with open("pairs.py", "a", encoding="utf8") as f:
                f.write("{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n")
            print(date, end="\r")
        if line.strip():
            current.append(line)

11
gwen.py
View File

@@ -1,11 +1,9 @@
from sys import stderr as err
from sys import argv
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TextIteratorStreamer,
)
from sys import argv, stderr as err
import threading
import torch
@@ -22,10 +20,11 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
dtype=torch.float16 if device.type == "cuda" else None,
device_map=device)
device_map=device,
)
print("tokenizer.model_max_length =", tokenizer.model_max_length, file=err)
print("model.config.max_position_embeddings =", model.config.max_position_embeddings, file=err)
print("max_length =", tokenizer.model_max_length, file=err)
print("max_embeds =", model.config.max_position_embeddings, file=err)
# 3) Prepare chat inputs (tokenized tensors)
if len(argv) > 1:

View File

@@ -1,42 +0,0 @@
# WIP thingy that downloads official patch notes, starting from 24/12/2025 going one week back at a time
# WIP because patch notes don't always come out on thursdays, which this script just assumes
# the "idea" is to download a ton of patch notes and get the corresponding recent news articles from the wiki
# and train or fine tune some kind of model to do patch notes better
from datetime import date, timedelta
import requests
from bs4 import BeautifulSoup

d = date(2025, 12, 24)
URL = "https://azurlane.yo-star.com/news/"
# `search` remembers the first missing day of a week so the forward,
# day-by-day scan knows when it has wrapped around without finding a page.
search = None
# NOTE(review): loop has no exit condition — it runs until interrupted.
while True:
    print(search)
    if d == search:
        # presumably: the forward scan came back to where it started, so
        # give up on this week and step further back — TODO confirm intent
        search = d - timedelta(days=7)
        d -= timedelta(days=14)
        continue
    # renamed from `date`, which shadowed the datetime.date import above
    date_path = f"{d.year}/{d.month}/{d.day}"
    print(URL + date_path)
    # timeout so a stalled connection cannot hang the scrape forever
    response = requests.get(URL + date_path, timeout=30)
    bs = BeautifulSoup(response.text, features="html.parser")
    if "Oops!" in bs.text:
        # Missing article: start (or continue) scanning forward one day at
        # a time until a real page appears.
        if search is None:
            search = d
        d += timedelta(days=1)
        continue
    wp = bs.find(id="main")
    text = wp.get_text(separator="\n").splitlines()
    # Trim site boilerplate around the actual patch-note body.
    while text[0] != "List of New Contents":
        text.pop(0)
    while text[-1] != "Friendly Reminders":
        text.pop()
    text.pop()  # drop the "Friendly Reminders" marker itself
    with open(f"{d.year}-{d.month}-{d.day}.txt", "w", encoding="utf8") as f:
        f.write("\n".join(text))
    d -= timedelta(days=7)

212
train.ipynb Normal file
View File

@@ -0,0 +1,212 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "01aabcdb",
"metadata": {},
"source": [
"import stuff"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85710d55",
"metadata": {},
"outputs": [],
"source": [
"from transformers import (\n",
" AutoTokenizer,\n",
" AutoModelForCausalLM,\n",
" TextIteratorStreamer,\n",
")\n",
"from sys import stderr as err\n",
"import threading\n",
"import torch"
]
},
{
"cell_type": "markdown",
"id": "827268e2",
"metadata": {},
"source": [
"load model and set max tokens to 131072 (using rope yarn thingy whatever)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6453597",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using device: cuda\n",
"Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00, 1.95s/it]\n",
"max_length = 131072\n",
"max_embeds = 131072\n"
]
}
],
"source": [
"model_name = \"Qwen/Qwen3-8B-FP8\"\n",
"\n",
"# 1) Choose device (use CUDA if available)\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"print(\"Using device:\", device, file=err)\n",
"\n",
"# 2) Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"\n",
"# If GPU and limited VRAM, consider dtype=torch.float16 for half precision\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" dtype=torch.float16 if device.type == \"cuda\" else None,\n",
" device_map=device,\n",
")\n",
"\n",
"print(\"max_length =\", tokenizer.model_max_length, file=err)\n",
"print(\"max_embeds =\", model.config.max_position_embeddings, file=err)"
]
},
{
"cell_type": "markdown",
"id": "b1699f5e",
"metadata": {},
"source": [
"prep prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d78c3dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"input tokens = 11541\n"
]
}
],
"source": [
"# 3) Prepare chat inputs (tokenized tensors)\n",
"prompt = open(\"prompt\").read().strip()\n",
"messages = [{\"role\": \"user\", \"content\": prompt}]\n",
"inputs = tokenizer.apply_chat_template(\n",
" messages,\n",
" add_generation_prompt=True,\n",
" tokenize=True,\n",
" return_dict=True,\n",
" return_tensors=\"pt\",\n",
")\n",
"\n",
"num_input_tokens = inputs[\"input_ids\"].shape[1]\n",
"tokens = num_input_tokens\n",
"print(\"input tokens =\", num_input_tokens, file=err)\n",
"\n",
"# Move input tensors to the same device as the model\n",
"inputs = {k: v.to(device) for k, v in inputs.items()}\n",
"\n",
"# 4) Create streamer\n",
"streamer = TextIteratorStreamer(\n",
" tokenizer, \n",
" skip_prompt=True, \n",
" skip_special_tokens=True\n",
")\n",
"\n",
"# 5) Start generation in background thread (generate is blocking)\n",
"gen_kwargs = dict(\n",
" **inputs,\n",
" max_new_tokens=131072,\n",
" streamer=streamer,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "bc6bf4f8",
"metadata": {},
"source": [
"do inference"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ad2f8968",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<think>\n",
"Okay, let me try to figure out how to transform the given news into the MediaWiki format based on the examples provided. First, I need to understand the structure of the examples to replicate it accurately.\n",
"\n",
"Looking at the first example, the news is structured with a date header, then under \"New Contents\" there are several list items, each starting with a bold title. Each list item has bullet points with specific details. For instance, \"New Chapter\" has subpoints about the chapter availability, obtainable ships, enemy levels, and level caps. Then there's a section for \"System Optimization\" with numbered points.\n",
"\n",
"The second example has more sections, like \"Limited Time Event\" and \"New [Skins]\" with different sub-sections. The third example includes \"New Contents\" with various subcategories like \"New Chapter\", \"New gameplay added\", \"New Character\", \"Augment Update\", \"New Memory\", \"FleetChat Update\", \"CV Update\", and \"System Optimization\". Each of these has specific formatting, such as using ShipDisplay templates with parameters, and sometimes tables for skins or furniture.\n",
"\n",
"Now, the news I need to convert is from January 8, 2026. Let's parse the content step by step.\n",
"\n",
"Starting with the date: \"Posted on January 8, 2026\" becomes \"==January "
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 2\u001b[39m thread.start()\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# 6) Consume and display streamed text in real time\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mtokens\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mend\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflush\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/inference/.venv/lib/python3.12/site-packages/transformers/generation/streamers.py:226\u001b[39m, in \u001b[36mTextIteratorStreamer.__next__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 225\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m226\u001b[39m value = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtext_queue\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 227\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m value == \u001b[38;5;28mself\u001b[39m.stop_signal:\n\u001b[32m 228\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m()\n",
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/queue.py:171\u001b[39m, in \u001b[36mQueue.get\u001b[39m\u001b[34m(self, block, timeout)\u001b[39m\n\u001b[32m 169\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 170\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._qsize():\n\u001b[32m--> \u001b[39m\u001b[32m171\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnot_empty\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 172\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m timeout < \u001b[32m0\u001b[39m:\n\u001b[32m 173\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33m'\u001b[39m\u001b[33mtimeout\u001b[39m\u001b[33m'\u001b[39m\u001b[33m must be a non-negative number\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/threading.py:355\u001b[39m, in \u001b[36mCondition.wait\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 353\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[32m 354\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m355\u001b[39m \u001b[43mwaiter\u001b[49m\u001b[43m.\u001b[49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 356\u001b[39m gotit = \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[32m 357\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
"\u001b[31mKeyboardInterrupt\u001b[39m: "
]
}
],
"source": [
"thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)\n",
"thread.start()\n",
"\n",
"# 6) Consume and display streamed text in real time\n",
"for chunk in streamer:\n",
" tokens += len(tokenizer.encode(chunk, add_special_tokens=False))\n",
" print(chunk, end=\"\", flush=True)\n",
" # print(tokens, \"/131072 of token limit\", end=\"\\r\", sep=\"\", file=err)\n",
"print()\n",
"\n",
"thread.join()\n",
"print() # final newline"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}