# The dataset (and probably the model) is excluded from the repo.
# This is mainly to protect the work of the contributors who painstakingly
# wrote all the previous patch notes by hand. This obviously doesn't make it
# impossible to extract the data yourself, but hopefully it makes it hard
# enough for most people to not bother.
import requests
from bs4 import BeautifulSoup

URL = "https://azurlane.yo-star.com/news/"

MONTHS = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]


def get_text(date):
    """Fetch the news page for *date* (``"Y/M/D"``) and return its visible
    text as a list of lines.

    The lines are trimmed to the span starting at the "List of New Contents"
    header and ending just before the "Friendly Reminders" footer. If the
    site serves its "Oops! That page can't be found" error page instead, the
    untrimmed text is returned so the caller can detect the miss.
    """
    response = requests.get(URL + date)
    bs = BeautifulSoup(response.text, features="html.parser")
    wp = bs.find(id="main")
    text = wp.get_text(separator="\n").splitlines()
    # Error page: return it untrimmed so the caller can skip this date.
    if any("Oops! That" in line for line in text):
        return text
    # Drop everything before the contents header ...
    while text[0] != "List of New Contents":
        text.pop(0)
    # ... and everything from the footer onwards (footer line included).
    while text[-1] != "Friendly Reminders":
        text.pop()
    text.pop()
    return text


# ans.txt is the wikitext of all the recent news, most crucially including
# dates. Each "== Month Day, Year ==" heading closes the wikitext collected
# so far, triggers a fetch of the matching news page, and appends the
# (page text, wikitext) pair to pairs.py.
current = []
with open("ans.txt", encoding="utf8") as lines:
    for line in map(str.strip, lines):
        if line.startswith("==") and any(month in line for month in MONTHS) and current:
            out = "\n".join(current)
            current = []
            m, d, y = line.replace("=", "").strip().split()
            m = MONTHS.index(m) + 1
            # Strip ordinal suffix / trailing punctuation, e.g. "3rd," -> "3".
            while not d[-1].isnumeric():
                d = d[:-1]
            # The news URL appears to use the day before the wiki date.
            # NOTE(review): confirm this off-by-one is intentional.
            d = int(d) - 1
            date = "/".join(map(str, (y, m, d)))
            resp = get_text(date)
            # BUG FIX: resp is a list of lines, so the original
            # `"Oops!" in resp` tested for an element *equal* to "Oops!"
            # and never matched the error page; test substrings instead
            # (mirrors the detection inside get_text).
            if any("Oops!" in part for part in resp):
                print("skipping", date)
                continue
            prompt = "\n".join(resp)
            # Normalise typographic punctuation to ASCII.
            for a, b in (("–", "-"), ("’", "'")):
                prompt = prompt.replace(a, b)
            with open("pairs.py", "a", encoding="utf8") as f:
                f.write(
                    "{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n"
                )
            print(date, end="\r")
        if line:
            current.append(line)