moved finished notes to own folder; started work on fine tuning

This commit is contained in:
2026-01-09 04:48:16 +01:00
parent b426fd87d2
commit 750f067b75
6 changed files with 297 additions and 52 deletions

76
dataset/newsdownloader.py Normal file
View File

@@ -0,0 +1,76 @@
# the dataset (and probably the model) is excluded from the repo
# this is mainly to protect the work of the contributors
# who painstakingly wrote all the previous patch notes by hand
# this obviously doesn't make it impossible to extract the data yourself but
# hopefully it makes it hard enough for most people to not bother
import requests
from bs4 import BeautifulSoup
URL = "https://azurlane.yo-star.com/news/"
# Calendar month names; index + 1 is the month number.  Used both to
# recognise wiki date headers and to convert the name to a numeric month.
MONTHS = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]
def get_text(date):
    """Fetch the official news article for *date* (``"YYYY/M/D"``) and
    return its lines.

    Returns the page text trimmed to the span between the
    ``"List of New Contents"`` and ``"Friendly Reminders"`` markers
    (both markers excluded from the tail).  If the article does not
    exist, yo-star serves an "Oops! That page can't be found" page;
    in that case the *untrimmed* lines are returned so the caller can
    detect the error text and skip the date.
    """
    # timeout so a stalled connection cannot hang the scrape forever
    response = requests.get(URL + date, timeout=30)
    bs = BeautifulSoup(response.text, features="html.parser")
    wp = bs.find(id="main")
    text = wp.get_text(separator="\n").splitlines()
    if any("Oops! That" in line for line in text):
        return text
    # Drop the site boilerplate before the patch-note body...
    while text[0] != "List of New Contents":
        text.pop(0)
    # ...and everything after it, including the marker line itself.
    while text[-1] != "Friendly Reminders":
        text.pop()
    text.pop()
    return text
# ans.txt is the wikitext of all the recent news
# most crucially including dates
#
# Each "== Month Day, Year ==" header flushes the wikitext buffered so far
# and pairs it with the official news article fetched for that header's
# date.  NOTE(review): text after the *final* header is never flushed —
# confirm that losing the last section is acceptable.
with open("ans.txt", encoding="utf8") as lines:
    current = []  # wikitext lines accumulated since the previous flush
    for line in map(str.strip, lines):
        # A month header marks a section boundary (skip when nothing is buffered).
        if line.startswith("==") and any(month in line for month in MONTHS) and current:
            out = "\n".join(current)
            current = []
            # Header looks like "== January 8, 2026 ==" -> month day, year.
            m, d, y = line.replace("=", "").strip().split()
            m = MONTHS.index(m) + 1
            # Strip trailing punctuation from the day token ("8," -> "8").
            while not d[-1].isnumeric():
                d = d[:-1]
            # presumably the news URL uses the day before the wiki date — TODO confirm
            d = int(d) - 1
            date = "/".join(map(str, (y, m, d)))
            resp = get_text(date)
            # get_text returns the error page untrimmed when the article is
            # missing; match the marker as a substring of each line — the old
            # `"Oops!" in resp` was a list-membership test that required a
            # line to be exactly "Oops!" and therefore never triggered.
            if any("Oops!" in ln for ln in resp):
                print("skipping", date)
                continue
            prompt = "\n".join(resp)
            # Normalise typographic characters to ASCII equivalents.
            # NOTE(review): the originals were garbled in transit — presumed
            # to be an en dash and a curly apostrophe; confirm.
            for a, b in (
                ("\u2013", "-"),   # en dash
                ("\u2019", "'"),   # right single quotation mark
            ):
                prompt = prompt.replace(a, b)
            with open("pairs.py", "a", encoding="utf8") as f:
                f.write("{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n")
            print(date, end="\r")
        if line.strip():
            current.append(line)

11
gwen.py
View File

@@ -1,11 +1,9 @@
from sys import stderr as err
from sys import argv
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TextIteratorStreamer,
)
from sys import argv, stderr as err
import threading
import torch
@@ -22,10 +20,11 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
dtype=torch.float16 if device.type == "cuda" else None,
device_map=device)
device_map=device,
)
print("tokenizer.model_max_length =", tokenizer.model_max_length, file=err)
print("model.config.max_position_embeddings =", model.config.max_position_embeddings, file=err)
print("max_length =", tokenizer.model_max_length, file=err)
print("max_embeds =", model.config.max_position_embeddings, file=err)
# 3) Prepare chat inputs (tokenized tensors)
if len(argv) > 1:

View File

@@ -1,42 +0,0 @@
# WIP thingy that downloads official patch notes, starting from 24/12/2025 going one week back at a time
# WIP because patch notes don't always come out on thursdays, which this script just assumes
# the "idea" is to download a ton of patch notes and get the corresponding recent news articles from the wiki
# and train or fine tune some kind of model to do patch notes better
from datetime import date, timedelta
import requests
from bs4 import BeautifulSoup

d = date(2025, 12, 24)
URL = "https://azurlane.yo-star.com/news/"
# `search` remembers the first missing day of a week so the forward,
# day-by-day scan knows when it has wrapped around without finding a page.
search = None
# NOTE(review): loop has no exit condition — it runs until interrupted.
while True:
    print(search)
    if d == search:
        # presumably: the forward scan came back to where it started, so
        # give up on this week and step further back — TODO confirm intent
        search = d - timedelta(days=7)
        d -= timedelta(days=14)
        continue
    # renamed from `date`, which shadowed the datetime.date import above
    date_path = f"{d.year}/{d.month}/{d.day}"
    print(URL + date_path)
    # timeout so a stalled connection cannot hang the scrape forever
    response = requests.get(URL + date_path, timeout=30)
    bs = BeautifulSoup(response.text, features="html.parser")
    if "Oops!" in bs.text:
        # Missing article: start (or continue) scanning forward one day at
        # a time until a real page appears.
        if search is None:
            search = d
        d += timedelta(days=1)
        continue
    wp = bs.find(id="main")
    text = wp.get_text(separator="\n").splitlines()
    # Trim site boilerplate around the actual patch-note body.
    while text[0] != "List of New Contents":
        text.pop(0)
    while text[-1] != "Friendly Reminders":
        text.pop()
    text.pop()  # drop the "Friendly Reminders" marker itself
    with open(f"{d.year}-{d.month}-{d.day}.txt", "w", encoding="utf8") as f:
        f.write("\n".join(text))
    d -= timedelta(days=7)

212
train.ipynb Normal file
View File

@@ -0,0 +1,212 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "01aabcdb",
"metadata": {},
"source": [
"import stuff"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "85710d55",
"metadata": {},
"outputs": [],
"source": [
"from transformers import (\n",
" AutoTokenizer,\n",
" AutoModelForCausalLM,\n",
" TextIteratorStreamer,\n",
")\n",
"from sys import stderr as err\n",
"import threading\n",
"import torch"
]
},
{
"cell_type": "markdown",
"id": "827268e2",
"metadata": {},
"source": [
"load model and set max tokens to 131072 (using rope yarn thingy whatever)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f6453597",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using device: cuda\n",
"Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00, 1.95s/it]\n",
"max_length = 131072\n",
"max_embeds = 131072\n"
]
}
],
"source": [
"model_name = \"Qwen/Qwen3-8B-FP8\"\n",
"\n",
"# 1) Choose device (use CUDA if available)\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"print(\"Using device:\", device, file=err)\n",
"\n",
"# 2) Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"\n",
"# If GPU and limited VRAM, consider dtype=torch.float16 for half precision\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_name,\n",
" dtype=torch.float16 if device.type == \"cuda\" else None,\n",
" device_map=device,\n",
")\n",
"\n",
"print(\"max_length =\", tokenizer.model_max_length, file=err)\n",
"print(\"max_embeds =\", model.config.max_position_embeddings, file=err)"
]
},
{
"cell_type": "markdown",
"id": "b1699f5e",
"metadata": {},
"source": [
"prep prompt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0d78c3dd",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"input tokens = 11541\n"
]
}
],
"source": [
"# 3) Prepare chat inputs (tokenized tensors)\n",
"prompt = open(\"prompt\").read().strip()\n",
"messages = [{\"role\": \"user\", \"content\": prompt}]\n",
"inputs = tokenizer.apply_chat_template(\n",
" messages,\n",
" add_generation_prompt=True,\n",
" tokenize=True,\n",
" return_dict=True,\n",
" return_tensors=\"pt\",\n",
")\n",
"\n",
"num_input_tokens = inputs[\"input_ids\"].shape[1]\n",
"tokens = num_input_tokens\n",
"print(\"input tokens =\", num_input_tokens, file=err)\n",
"\n",
"# Move input tensors to the same device as the model\n",
"inputs = {k: v.to(device) for k, v in inputs.items()}\n",
"\n",
"# 4) Create streamer\n",
"streamer = TextIteratorStreamer(\n",
" tokenizer, \n",
" skip_prompt=True, \n",
" skip_special_tokens=True\n",
")\n",
"\n",
"# 5) Start generation in background thread (generate is blocking)\n",
"gen_kwargs = dict(\n",
" **inputs,\n",
" max_new_tokens=131072,\n",
" streamer=streamer,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "bc6bf4f8",
"metadata": {},
"source": [
"do inference"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ad2f8968",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<think>\n",
"Okay, let me try to figure out how to transform the given news into the MediaWiki format based on the examples provided. First, I need to understand the structure of the examples to replicate it accurately.\n",
"\n",
"Looking at the first example, the news is structured with a date header, then under \"New Contents\" there are several list items, each starting with a bold title. Each list item has bullet points with specific details. For instance, \"New Chapter\" has subpoints about the chapter availability, obtainable ships, enemy levels, and level caps. Then there's a section for \"System Optimization\" with numbered points.\n",
"\n",
"The second example has more sections, like \"Limited Time Event\" and \"New [Skins]\" with different sub-sections. The third example includes \"New Contents\" with various subcategories like \"New Chapter\", \"New gameplay added\", \"New Character\", \"Augment Update\", \"New Memory\", \"FleetChat Update\", \"CV Update\", and \"System Optimization\". Each of these has specific formatting, such as using ShipDisplay templates with parameters, and sometimes tables for skins or furniture.\n",
"\n",
"Now, the news I need to convert is from January 8, 2026. Let's parse the content step by step.\n",
"\n",
"Starting with the date: \"Posted on January 8, 2026\" becomes \"==January "
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[7]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 2\u001b[39m thread.start()\n\u001b[32m 4\u001b[39m \u001b[38;5;66;03m# 6) Consume and display streamed text in real time\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;43;01mfor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01min\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[43m:\u001b[49m\n\u001b[32m 6\u001b[39m \u001b[43m \u001b[49m\u001b[43mtokens\u001b[49m\u001b[43m \u001b[49m\u001b[43m+\u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mencode\u001b[49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43madd_special_tokens\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 7\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mprint\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mchunk\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mend\u001b[49m\u001b[43m=\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mflush\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/inference/.venv/lib/python3.12/site-packages/transformers/generation/streamers.py:226\u001b[39m, in \u001b[36mTextIteratorStreamer.__next__\u001b[39m\u001b[34m(self)\u001b[39m\n\u001b[32m 225\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__next__\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m226\u001b[39m value = \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtext_queue\u001b[49m\u001b[43m.\u001b[49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 227\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m value == \u001b[38;5;28mself\u001b[39m.stop_signal:\n\u001b[32m 228\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m()\n",
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/queue.py:171\u001b[39m, in \u001b[36mQueue.get\u001b[39m\u001b[34m(self, block, timeout)\u001b[39m\n\u001b[32m 169\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 170\u001b[39m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m._qsize():\n\u001b[32m--> \u001b[39m\u001b[32m171\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mnot_empty\u001b[49m\u001b[43m.\u001b[49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 172\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m timeout < \u001b[32m0\u001b[39m:\n\u001b[32m 173\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33m'\u001b[39m\u001b[33mtimeout\u001b[39m\u001b[33m'\u001b[39m\u001b[33m must be a non-negative number\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m/usr/lib/python3.12/threading.py:355\u001b[39m, in \u001b[36mCondition.wait\u001b[39m\u001b[34m(self, timeout)\u001b[39m\n\u001b[32m 353\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[32m 354\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m355\u001b[39m \u001b[43mwaiter\u001b[49m\u001b[43m.\u001b[49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 356\u001b[39m gotit = \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[32m 357\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n",
"\u001b[31mKeyboardInterrupt\u001b[39m: "
]
}
],
"source": [
"thread = threading.Thread(target=model.generate, kwargs=gen_kwargs)\n",
"thread.start()\n",
"\n",
"# 6) Consume and display streamed text in real time\n",
"for chunk in streamer:\n",
" tokens += len(tokenizer.encode(chunk, add_special_tokens=False))\n",
" print(chunk, end=\"\", flush=True)\n",
" # print(tokens, \"/131072 of token limit\", end=\"\\r\", sep=\"\", file=err)\n",
"print()\n",
"\n",
"thread.join()\n",
"print() # final newline"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}