77 lines
2.0 KiB
Python
77 lines
2.0 KiB
Python
# the dataset (and probably the model) is excluded from the repo
|
||
# this is mainly to protect the work of the contributors
|
||
# who painstakingly wrote all the previous patch notes by hand
|
||
# this obviously doesn't make it impossible to extract the data yourself but
|
||
# hopefully it makes it hard enough for most people to not bother
|
||
|
||
import requests
|
||
from bs4 import BeautifulSoup
|
||
|
||
# Base address of the news archive; a "year/month/day" path is appended to it.
URL = "https://azurlane.yo-star.com/news/"

# English month names in calendar order, so MONTHS.index(name) + 1 is the
# month number.
MONTHS = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]
|
||
|
||
def get_text(date, timeout=30):
    """Fetch the news page for ``date`` and return its patch-note lines.

    Args:
        date: Path fragment appended to ``URL`` (the caller passes
            ``"year/month/day"``).
        timeout: Seconds to wait for the server before ``requests`` gives
            up, so a stalled connection cannot hang the script forever.

    Returns:
        A list of text lines taken from the element with ``id="main"``.
        If any line contains ``"Oops! That"`` (presumably the site's
        "page not found" message), every line is returned untrimmed so the
        caller can detect it. Otherwise the list starts at the first
        ``"List of New Contents"`` line and stops just before the last
        ``"Friendly Reminders"`` line.

    Raises:
        IndexError: If either marker line is missing from the page.
        AttributeError: If the page has no element with ``id="main"``
            (``find`` returns ``None`` in that case).
    """
    start_marker = "List of New Contents"
    end_marker = "Friendly Reminders"

    response = requests.get(URL + date, timeout=timeout)
    soup = BeautifulSoup(response.text, features="html.parser")
    main = soup.find(id="main")
    text = main.get_text(separator="\n").splitlines()

    if any("Oops! That" in line for line in text):
        return text

    try:
        start = text.index(start_marker)
    except ValueError:
        # Same exception type the old pop-until-empty loop ended with,
        # but with a message that says what actually went wrong.
        raise IndexError(
            f"{start_marker!r} not found on news page {date!r}"
        ) from None

    # Search backwards so the *last* end marker after the start wins.
    for end in range(len(text) - 1, start, -1):
        if text[end] == end_marker:
            break
    else:
        raise IndexError(
            f"{end_marker!r} not found after {start_marker!r} "
            f"on news page {date!r}"
        )

    # One slice instead of repeated pop(0)/pop(); the end marker is excluded.
    return text[start:end]
|
||
|
||
# ans.txt is the wikitext of all the recent news,
# most crucially including dates.
#
# The loop below collects non-empty lines into `current`. Whenever a date
# heading ("==Month Day, Year==") is reached and `current` is non-empty, the
# collected lines become the "output", the news page dated one day before the
# heading is fetched as the "input", and the pair is appended to pairs.py.
#
# NOTE(review): the text in `current` was collected *before* the heading that
# supplies the date, and whatever is collected after the final heading is
# never written. Confirm this matches how ans.txt is laid out.


def _news_path(heading):
    """Return the ``year/month/day`` URL path for the day before ``heading``.

    ``heading`` is a line such as ``"==May 18th, 2023=="``. Real date
    arithmetic is used so that a heading dated the 1st maps to the last day
    of the previous month instead of day ``0``.
    """
    from datetime import date, timedelta

    month_name, day, year = heading.replace("=", "").strip().split()
    # Drop the ordinal suffix and punctuation: "18th," -> "18".
    while not day[-1].isnumeric():
        day = day[:-1]
    target = date(int(year), MONTHS.index(month_name) + 1, int(day))
    target -= timedelta(days=1)
    # No zero padding, matching the URLs the script has always requested.
    return f"{target.year}/{target.month}/{target.day}"


current = []
# utf8 matches the encoding used for pairs.py below; the context manager
# closes the file once the loop finishes.
with open("ans.txt", encoding="utf8") as lines:
    for line in map(str.strip, lines):
        is_date_heading = line.startswith("==") and any(
            month in line for month in MONTHS
        )
        if is_date_heading and current:
            out = "\n".join(current)
            current = []

            date = _news_path(line)
            resp = get_text(date)
            # `resp` is a list of lines, so look for the marker inside each
            # line; a plain `"Oops!" in resp` only matches a line that is
            # exactly "Oops!".
            if any("Oops!" in resp_line for resp_line in resp):
                print("skipping", date)
                # NOTE: `continue` also skips adding this heading to `current`.
                continue
            prompt = "\n".join(resp)

            # Normalise typographic punctuation to plain ASCII.
            for a, b in (
                ("–", "-"),
                ("’", "'"),
            ):
                prompt = prompt.replace(a, b)

            with open("pairs.py", "a", encoding="utf8") as f:
                f.write("{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n")

            print(date, end="\r")

        if line:
            current.append(line)
|
||
|