moved finished notes to own folder; started work on fine tuning
This commit is contained in:
76
dataset/newsdownloader.py
Normal file
76
dataset/newsdownloader.py
Normal file
@@ -0,0 +1,76 @@
|
||||
# the dataset (and probably the model) is excluded from the repo
|
||||
# this is mainly to protect the work of the contributors
|
||||
# who painstakingly wrote all the previous patch notes by hand
|
||||
# this obviously doesn't make it impossible to extract the data yourself but
|
||||
# hopefully it makes it hard enough for most people to not bother
|
||||
|
||||
import datetime

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
# base address of the news site; get_text() appends a date path to it
URL = "https://azurlane.yo-star.com/news/"

# English month names in calendar order, so MONTHS.index(name) + 1
# is the month number
MONTHS = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]
|
||||
|
||||
def get_text(date):
    """Download one news page and return the lines of its patch-note section.

    Args:
        date: Path appended to ``URL`` to form the page address.

    Returns:
        A list of text lines taken from the page's ``id="main"`` element.
        If any line contains "Oops! That", the complete, untrimmed list is
        returned so the caller can recognise that page. Otherwise the list
        runs from the first "List of New Contents" line up to, but not
        including, the last "Friendly Reminders" line.

    Raises:
        ValueError: If the page has no ``id="main"`` element, or one of the
            two marker lines is missing.
        requests.exceptions.RequestException: If the download fails or
            takes longer than the timeout.
    """
    address = URL + date
    # without a timeout a stalled connection would block the script forever
    response = requests.get(address, timeout=30)
    soup = BeautifulSoup(response.text, features="html.parser")

    main = soup.find(id="main")
    if main is None:
        raise ValueError(f"no element with id='main' found at {address}")
    lines = main.get_text(separator="\n").splitlines()

    if any("Oops! That" in line for line in lines):
        return lines

    start_marker = "List of New Contents"
    end_marker = "Friendly Reminders"

    if start_marker not in lines:
        raise ValueError(f"{start_marker!r} not found at {address}")
    start = lines.index(start_marker)

    # scan backwards so that the last end marker after the start wins
    end = next(
        (i for i in range(len(lines) - 1, start, -1) if lines[i] == end_marker),
        None,
    )
    if end is None:
        raise ValueError(f"{end_marker!r} not found at {address}")

    return lines[start:end]
|
||||
|
||||
def _is_header(line):
    """Return True for a wikitext heading line that names a month."""
    return line.startswith("==") and any(month in line for month in MONTHS)


def _news_path(header):
    """Return the "year/month/day" path for the day before *header*'s date.

    *header* is a heading line made of a month name, a day and a year
    wrapped in "=" signs. The subtraction goes through ``datetime`` so that
    the 1st of a month rolls back to the end of the previous month instead
    of producing day 0.
    """
    month_name, day, year = header.replace("=", "").strip().split()
    # drop whatever trails the day number (ordinal suffix, comma, ...)
    while day and not day[-1].isnumeric():
        day = day[:-1]
    heading_day = datetime.date(
        int(year), MONTHS.index(month_name) + 1, int(day)
    )
    news_day = heading_day - datetime.timedelta(days=1)
    # components are deliberately not zero-padded
    return f"{news_day.year}/{news_day.month}/{news_day.day}"


def _write_pair(section):
    """Fetch the news page for *section* and append the pair to pairs.py.

    *section* is a list of non-empty lines whose first entry is the heading
    that dates it. Lines collected before the first heading carry no date
    and are ignored.
    """
    if not _is_header(section[0]):
        return

    date = _news_path(section[0])
    resp = get_text(date)
    # get_text returns a list of lines, so the error page has to be detected
    # per line; a plain `"Oops!" in resp` only matches a line that is
    # exactly "Oops!"
    if any("Oops! That" in line for line in resp):
        print("skipping", date)
        return

    prompt = "\n".join(resp)
    for a, b in (
        ("–", "-"),
        ("’", "'"),
    ):
        prompt = prompt.replace(a, b)
    out = "\n".join(section)

    # NOTE(review): if pairs.py is later evaluated as Python, text containing
    # triple quotes or backslashes will not survive this literal format
    with open("pairs.py", "a", encoding="utf8") as f:
        f.write("{" + f'"input": """{prompt}""", \n"output": """{out}"""' + "},\n")

    print(date, end="\r")


# ans.txt is the wikitext of all the recent news,
# most crucially including the dated heading of every entry
# (read as utf8 to match the encoding pairs.py is written with)
with open("ans.txt", encoding="utf8") as lines:
    current = []
    for line in map(str.strip, lines):
        # a new heading closes the section collected so far; that section
        # is paired with the date of its own heading
        if _is_header(line) and current:
            _write_pair(current)
            current = []

        if line:
            current.append(line)

# the last section has no following heading to trigger its write
if current:
    _write_pair(current)
|
||||
|
||||
Reference in New Issue
Block a user