# The dataset (and probably the model) is excluded from the repo.
# This is mainly to protect the work of the contributors who painstakingly
# wrote all the previous patch notes by hand. This obviously doesn't make it
# impossible to extract the data yourself, but hopefully it makes it hard
# enough for most people to not bother.
import requests
from bs4 import BeautifulSoup

URL = "https://azurlane.yo-star.com/news/"

MONTHS = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]


def get_text(date):
    """Fetch the news page for *date* (``"Y/M/D"``) and return its visible
    text as a list of lines.

    The lines are trimmed to the span starting at the "List of New Contents"
    header and ending just before the "Friendly Reminders" footer. If the
    site serves its "Oops! That page can't be found" error page instead, the
    untrimmed text is returned so the caller can detect the miss.
    """
    response = requests.get(URL + date)
    bs = BeautifulSoup(response.text, features="html.parser")
    wp = bs.find(id="main")
    text = wp.get_text(separator="\n").splitlines()
    # Error page: return it untrimmed so the caller can skip this date.
    if any("Oops! That" in line for line in text):
        return text
    # Drop everything before the contents header ...
    while text[0] != "List of New Contents":
        text.pop(0)
    # ... and everything from the footer onwards (footer line included).
    while text[-1] != "Friendly Reminders":
        text.pop()
    text.pop()
    return text


# ans.txt is the wikitext of all the recent news, most crucially including
# dates. Each "== Month Day, Year ==" heading closes the wikitext collected
# so far, triggers a fetch of the matching news page, and appends the
# (page text, wikitext) pair to pairs.py.
current = []
with open("ans.txt", encoding="utf8") as lines:
    for line in map(str.strip, lines):
        if line.startswith("==") and any(month in line for month in MONTHS) and current:
            out = "\n".join(current)
            current = []
            m, d, y = line.replace("=", "").strip().split()
            m = MONTHS.index(m) + 1
            # Strip ordinal suffix / trailing punctuation, e.g. "3rd," -> "3".
            while not d[-1].isnumeric():
                d = d[:-1]
            # The news URL appears to use the day before the wiki date.
            # NOTE(review): confirm this off-by-one is intentional.
            d = int(d) - 1
            date = "/".join(map(str, (y, m, d)))
            resp = get_text(date)
            # BUG FIX: resp is a list of lines, so the original
            # `"Oops!" in resp` tested for an element *equal* to "Oops!"
            # and never matched the error page; test substrings instead
            # (mirrors the detection inside get_text).
            if any("Oops!" in part for part in resp):
                print("skipping", date)
                continue
            prompt = "\n".join(resp)
            # Normalise typographic punctuation to ASCII.
            for a, b in (("–", "-"), ("’", "'")):
                prompt = prompt.replace(a, b)
            with open("pairs.py", "a", encoding="utf8") as f:
                f.write(
                    "{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n"
                )
            print(date, end="\r")
        if line:
            current.append(line)