moved finished notes to own folder; started work on fine tuning

This commit is contained in:
2026-01-09 04:48:16 +01:00
parent b426fd87d2
commit 750f067b75
6 changed files with 297 additions and 52 deletions

76
dataset/newsdownloader.py Normal file
View File

@@ -0,0 +1,76 @@
# the dataset (and probably the model) is excluded from the repo
# this is mainly to protect the work of the contributors
# who painstakingly wrote all the previous patch notes by hand
# this obviously doesn't make it impossible to extract the data yourself but
# hopefully it makes it hard enough for most people to not bother
import datetime

import requests
from bs4 import BeautifulSoup
# Base address of the news section; get_text() appends a date path to it.
URL = "https://azurlane.yo-star.com/news/"

# English month names in calendar order, so MONTHS.index(name) + 1 is the
# month number.
MONTHS = [
    "January", "February", "March",
    "April", "May", "June",
    "July", "August", "September",
    "October", "November", "December",
]
def get_text(date, timeout=30):
    """Download one news page and return the lines of its patch-note body.

    Args:
        date: Path fragment appended to ``URL`` (the caller in this file
            passes ``"year/month/day"``).
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A list of text lines taken from the element with ``id="main"``.
        If any line contains ``"Oops! That"`` (presumably the site's
        not-found page) the untrimmed lines are returned so the caller can
        detect it. Otherwise the list starts at the
        ``"List of New Contents"`` line and ends just before the last
        ``"Friendly Reminders"`` line.

    Raises:
        IndexError: If either marker line is missing from the page.
        AttributeError: If the page has no element with ``id="main"``.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    start_marker = "List of New Contents"
    end_marker = "Friendly Reminders"

    response = requests.get(URL + date, timeout=timeout)
    soup = BeautifulSoup(response.text, features="html.parser")
    main = soup.find(id="main")
    lines = main.get_text(separator="\n").splitlines()

    # Hand the page back untouched; the caller decides whether to skip it.
    if any("Oops! That" in line for line in lines):
        return lines

    # Drop everything before the first start marker (the marker is kept).
    try:
        start = lines.index(start_marker)
    except ValueError:
        raise IndexError(
            f"{start_marker!r} not found on {URL + date}"
        ) from None
    body = lines[start:]

    # Cut at the last end marker; the marker and what follows are dropped.
    for end in range(len(body) - 1, -1, -1):
        if body[end] == end_marker:
            return body[:end]
    raise IndexError(f"{end_marker!r} not found on {URL + date}")
# ans.txt is the wikitext of all the recent news,
# most crucially including the dates.
#
# The loop below walks ans.txt line by line. Every time it reaches a date
# heading (a line starting with "==" that contains a month name) it downloads
# the matching news page and appends one {"input": ..., "output": ...} record
# to pairs.py.


def _news_date(heading):
    """Return the ``year/month/day`` URL path for a wikitext date heading.

    The heading is expected to hold exactly three whitespace-separated
    tokens once the ``=`` signs are removed: month name, day (any trailing
    non-numeric characters such as an ordinal suffix or comma are dropped)
    and year. The returned path is for the day *before* the heading's date.

    Raises:
        ValueError: If the heading does not have that shape or is not a
            valid calendar date.
    """
    month_name, day_token, year = heading.replace("=", "").strip().split()
    month = MONTHS.index(month_name) + 1
    while day_token and not day_token[-1].isnumeric():
        day_token = day_token[:-1]
    heading_date = datetime.date(int(year), month, int(day_token))
    # Real date arithmetic so a heading on the 1st rolls back to the last day
    # of the previous month instead of producing day 0.
    previous = heading_date - datetime.timedelta(days=1)
    return f"{previous.year}/{previous.month}/{previous.day}"


# NOTE(review): in the reviewed copy of this file both search strings of the
# original replace table were empty (""), which would make str.replace insert
# the replacement between every character. They were presumably typographic
# dash / apostrophe characters lost in transit; the code points below are a
# best guess - confirm against the original file.
_TYPOGRAPHY_TO_ASCII = str.maketrans({
    "\u2013": "-",  # en dash
    "\u2014": "-",  # em dash
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark
})

current = []
# Read as UTF-8 to match the encoding used for the output file below.
# NOTE(review): assumes ans.txt was saved as UTF-8 - confirm.
with open("ans.txt", encoding="utf8") as lines:
    for line in map(str.strip, lines):
        is_date_heading = line.startswith("==") and any(
            month in line for month in MONTHS
        )
        if is_date_heading and current:
            # NOTE(review): `out` is the section collected *before* this
            # heading, while the page is fetched for *this* heading's date.
            # The section that follows the last heading is never written.
            # Confirm this pairing is intended.
            out = "\n".join(current)
            current = []
            date = _news_date(line)
            resp = get_text(date)
            # resp is a list of lines, so look inside each line; a plain
            # `"Oops!" in resp` only matches a line that is exactly "Oops!".
            if any("Oops!" in page_line for page_line in resp):
                print("skipping", date)
                # The heading is not added to `current` on this path.
                continue
            prompt = "\n".join(resp).translate(_TYPOGRAPHY_TO_ASCII)
            # NOTE(review): prompt/out are embedded unescaped inside
            # triple-quoted literals; text containing `"""` or backslashes
            # would make pairs.py invalid.
            with open("pairs.py", "a", encoding="utf8") as f:
                f.write("{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n")
            print(date, end="\r")
        if line.strip():
            current.append(line)