Files
patchnotes/dataset/newsdownloader.py

77 lines
2.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# the dataset (and probably the model) is excluded from the repo
# this is mainly to protect the work of the contributors
# who painstakingly wrote all the previous patch notes by hand
# this obviously doesn't make it impossible to extract the data yourself but
# hopefully it makes it hard enough for most people to not bother
import datetime

import requests
from bs4 import BeautifulSoup
# Base address of the news site; a date path fragment is appended to it
# to form the address of a single day's page.
URL = "https://azurlane.yo-star.com/news/"

# English month names in calendar order, so that
# MONTHS.index(name) + 1 is the month number.
MONTHS = [
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
]
_START_MARKER = "List of New Contents"
_END_MARKER = "Friendly Reminders"


def _slice_patch_notes(lines):
    """Return the lines from the start marker up to, but excluding, the end marker.

    The first occurrence of the start marker and the last occurrence of the
    end marker after it delimit the result. The start marker line itself is
    kept; the end marker line is dropped.

    Raises:
        ValueError: if either marker is missing.
    """
    try:
        start = lines.index(_START_MARKER)
    except ValueError:
        raise ValueError(f"start marker {_START_MARKER!r} not found") from None
    # Scan backwards so that the last end marker wins.
    for end in range(len(lines) - 1, start, -1):
        if lines[end] == _END_MARKER:
            return lines[start:end]
    raise ValueError(f"end marker {_END_MARKER!r} not found after the start marker")


def get_text(date, timeout=30):
    """Download one news page and return its patch-notes section as lines.

    Args:
        date: path fragment appended to URL; the caller in this file passes
            "year/month/day".
        timeout: seconds to wait for the server before requests gives up.

    Returns:
        A list of text lines. If any line of the page contains "Oops! That",
        the whole page is returned untrimmed so the caller can recognise it.
        Otherwise only the lines from "List of New Contents" up to, but not
        including, "Friendly Reminders" are returned.

    Raises:
        ValueError: if the page has no element with id "main", or if either
            section marker is missing.
        requests.RequestException: on network failure or timeout.
    """
    address = URL + date
    # The HTTP status is deliberately not checked: the "Oops! That" page
    # (presumably the site's not-found page) is recognised by its text below.
    response = requests.get(address, timeout=timeout)
    soup = BeautifulSoup(response.text, features="html.parser")
    main = soup.find(id="main")
    if main is None:
        raise ValueError(f"no element with id 'main' at {address}")
    text = main.get_text(separator="\n").splitlines()
    if any("Oops! That" in line for line in text):
        return text
    try:
        return _slice_patch_notes(text)
    except ValueError as err:
        raise ValueError(f"{address}: {err}") from None
# ans.txt is the wikitext of all the recent news
# most crucially including dates
# Build (news page text, hand-written notes) pairs and append them to pairs.py.
#
# ans.txt is read line by line and its non-blank lines are accumulated in
# `current`. When a "==" heading containing a month name is reached and
# something has been accumulated, the accumulated lines become the pair's
# output and the heading's date selects the news page used as its input.
# NOTE(review): the date therefore comes from the heading that *ends* the
# accumulated run, and whatever is still in `current` when the file ends is
# never written -- confirm both match the layout of ans.txt.
current = []
with open("ans.txt", encoding="utf8") as lines:
    for line in map(str.strip, lines):
        is_date_heading = line.startswith("==") and any(month in line for month in MONTHS)
        if is_date_heading and current:
            out = "\n".join(current)
            current = []

            # Heading text is expected to be "<Month> <day><suffix> <year>".
            month_name, day_token, year = line.replace("=", "").strip().split()
            # Drop trailing non-digits such as an ordinal suffix or a comma.
            day_digits = day_token
            while day_digits and not day_digits[-1].isnumeric():
                day_digits = day_digits[:-1]

            # The page for the day *before* the heading date is fetched. Real
            # date arithmetic is used so that the 1st of a month rolls back
            # into the previous month instead of producing day 0.
            news_day = datetime.date(
                int(year), MONTHS.index(month_name) + 1, int(day_digits)
            ) - datetime.timedelta(days=1)
            date = f"{news_day.year}/{news_day.month}/{news_day.day}"

            resp = get_text(date)
            # resp is a list of lines, so look inside each line rather than
            # testing for a line that is exactly "Oops!".
            if any("Oops!" in resp_line for resp_line in resp):
                print("skipping", date)
                # The heading is not added to `current` in this case.
                continue

            prompt = "\n".join(resp)
            # Normalise typographic punctuation to ASCII.
            # NOTE(review): the two source characters appear as empty strings
            # in this copy of the file, which would make str.replace insert
            # the replacement between every character. U+2013 (en dash) and
            # U+2019 (right single quote) are assumed -- confirm against the
            # original file.
            for a, b in (
                ("\u2013", "-"),
                ("\u2019", "'"),
            ):
                prompt = prompt.replace(a, b)

            # Append immediately so that finished pairs survive a later crash.
            with open("pairs.py", "a", encoding="utf8") as f:
                f.write("{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n")
            print(date, end="\r")
        if line:
            current.append(line)