Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 45 additions & 62 deletions src/eAsisitent_scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ def request_schedule(

:param school_id: The ID of the school you want to get data for
:type school_id: str
:param class_id: The ID of the class you want to get data for, defaults to 0 (optional)
:param professor: The ID of the professor you want to get data for, defaults to 0 (optional)
:param classroom: The classroom you want to get data for, defaults to 0 (optional)
:param interest_activity: The activity you want to get data for, defaults to 0 (optional)
:param school_week: 0 is the current week, 1 is the next week, 2 is the week after that, etc, defaults to 0 (optional)
:param student_id: The ID of the student you want to get the schedule for, defaults to 0 (optional)
:param class_id: The ID of the class you want to get data for, 0 is all classes, defaults to 0 (optional)
:param professor: The ID of the professor you want to get data for, 0 is all professors, defaults to 0 (optional)
:param classroom: The classroom you want to get data for, 0 is all classrooms, defaults to 0 (optional)
:param interest_activity: The activity you want to get data for, 0 is all interest activities, defaults to 0 (optional)
:param school_week: school week that you want to get the data for, 0 is the current week, defaults to 0 (optional)
:param student_id: The ID of the student you want to get the schedule for, 0 is all students, defaults to 0 (optional)
:param soup: Return a BeautifulSoup object (optional)
:return: The response, as a requests.models.Response object.

Expand All @@ -47,20 +47,6 @@ def request_schedule(
today = datetime.date.today()


def hour_to_num(hour: str) -> int:
    """
    Convert an hour label to its integer index.

    The label "predura" (any letter case, surrounding whitespace ignored)
    maps to 0. Any other label is expected to look like "<n>. ura", and the
    leading number ``n`` is returned.

    :param hour: the hour label that you want to be converted to int
    :type hour: str
    :return: The hour as an integer.
    :raises ValueError: if the label is neither "predura" nor "<n>. ura"
    """
    # Text pulled out of HTML often carries stray spaces or newlines; strip
    # them so that a padded "Predura" is still recognised.
    label = hour.strip()
    if label.lower() == "predura":
        return 0

    number_part = label.split(". ura")[0]
    try:
        return int(number_part)
    except ValueError:
        # Same exception type as before, but naming the offending label.
        raise ValueError(f"Unrecognised hour label: {hour!r}") from None


def get_schedule_data(
school_id: str,
class_id=0,
Expand All @@ -76,18 +62,17 @@ def get_schedule_data(

:param school_id: The ID of the school you want to get data for
:type school_id: str
:param class_id: The ID of the class you want to get data for, defaults to 0 (optional)
:param professor: The ID of the professor you want to get data for, defaults to 0 (optional)
:param classroom: The classroom you want to get data for, defaults to 0 (optional)
:param interest_activity: The activity you want to get data for, defaults to 0 (optional)
:param school_week: 0 is the current week, 1 is the next week, 2 is the week after that, etc, defaults to 0 (optional)
:param student_id: The ID of the student you want to get the schedule for, defaults to 0 (optional)
:param class_id: The ID of the class you want to get data for, 0 is all classes, defaults to 0 (optional)
:param professor: The ID of the professor you want to get data for, 0 is all professors, defaults to 0 (optional)
:param classroom: The classroom you want to get data for, 0 is all classrooms, defaults to 0 (optional)
:param interest_activity: The activity you want to get data for, 0 is all interest activities, defaults to 0 (optional)
:param school_week: school week that you want to get the data for, 0 is the current week, defaults to 0 (optional)
:param student_id: The ID of the student you want to get the schedule for, 0 is all students, defaults to 0 (optional)
:return: A dictionary with the data.
"""

# TODO: reduce complexity of the function,
# better naming of variables,
# get template for scraped_data from template.json
response = request_schedule(school_id=school_id,
class_id=class_id,
professor=professor,
Expand All @@ -96,64 +81,64 @@ def get_schedule_data(
school_week=school_week,
student_id=student_id)
soup = BeautifulSoup(response.text, "html5lib")
seznam_ur_teden = soup.select("body > table > tbody > tr")
table_rows = soup.select("body > table > tbody > tr")

count: int = -1

dates: list = []
dates_formatted: list = []
hour_times: list = []

scraped_data: dict = {str(i): {str(j): {} for j in range(15)} for i in range(7)}
scraped_data["week_data"] = {"hour_times": [], "dates": [], "current_week": "", "class": ""}
scraped_data: dict = {str(i): {} for i in range(7)}

current_week = int("".join(re.findall("[0-9]", [item.text.split(",")[0] for item in soup.select("body > div > span")][0])))
current_class = str([item.text.strip() for item in soup.select("body > div > strong")][0])

for i in seznam_ur_teden:
for table_row in table_rows:
if count == -1:
for days in i:
for days in table_row:
if type(days) == bs4.element.Tag:
day = days.select("div")
if day[0].text != "Ura":
temp_date = re.findall(r"[^A-z,. ]+", day[1].text)
temp_datetime = datetime.datetime(
day=int(temp_date[0]),
month=int(temp_date[1]),
year=today.year,
)
day=int(temp_date[0]),
month=int(temp_date[1]),
year=today.year,
)
dates_formatted.append(str(temp_datetime.strftime("%Y-%m-%d")))
dates.append(temp_datetime)
if count >= 0:
row = i.find_all("td", class_="ednevnik-seznam_ur_teden-td")
hour_name = row[0].find(class_="text14").text
row = table_row.find_all("td", class_="ednevnik-seznam_ur_teden-td")
hour_name = str(row[0].find(class_="text14").text)
hour_time = row[0].find(class_="text10").text
hour_times.append(hour_time)
hour_num = str(hour_to_num(hour_name))
hour_num = str(hour_num)

count2: int = 0
for block in row:
for row_part in row:
if count2 != 0:
"""Pass the first collum that contains hour times"""
date = dates[count2 - 1]
day_num = str(date.weekday())
date_formatted = str(date.strftime("%Y-%m-%d"))
if "style" not in block.attrs:
scraped_data[day_num].update({str(hour_name): {}})

if "style" not in row_part.attrs:
data_out = {
"subject": None,
"teacher": None,
"classroom": None,
"group": None,
"event": None,
"hour": int(hour_num),
"hour": hour_name,
"week_day": int(day_num),
"hour_in_block": 0,
"date": date_formatted,
}
scraped_data[day_num][hour_num]["0"] = data_out
scraped_data[day_num][hour_name]["0"] = data_out
else:
classes_in_hour = 0
for section in block:
for section in row_part:
if type(section) == bs4.element.Tag:
event = None
subject = None
Expand Down Expand Up @@ -199,10 +184,9 @@ def get_schedule_data(
teacher = teacher_classroom[0]
classroom = teacher_classroom[1]
except IndexError:
pass
pass # Makes it so empty strings don't crash the program
except AttributeError:
"""Makes it so empty strings don't crash the program"""
pass
pass # Makes it so empty strings don't crash the program
if group_raw:
for gr in group_raw:
group.append(gr.text)
Expand All @@ -212,17 +196,17 @@ def get_schedule_data(
section.attrs["id"],
)
):
"""Check for blocks"""
for block_part in section:
if type(block_part) == bs4.element.Tag:
# Check for blocks
for block in section:
if type(block) == bs4.element.Tag:
event = None
subject = None
group_raw = None
group = []
teacher = None
classroom = None
teacher_classroom = None
for img in block_part.select("img"):
for img in block.select("img"):
events_list = {
"Odpadla ura": "cancelled",
"Dogodek": "event",
Expand All @@ -243,15 +227,15 @@ def get_schedule_data(
event = "unknown_event"
try:
subject = (
block_part.find(class_="text14")
block.find(class_="text14")
.text.replace("\n", "")
.replace("\t", "")
)
group_raw = block_part.find_all(
group_raw = block.find_all(
class_="text11 gray bold"
)
teacher_classroom = (
block_part.find(class_="text11")
block.find(class_="text11")
.text.replace("\n", "")
.replace("\t", "")
.replace("\r", "")
Expand All @@ -262,8 +246,7 @@ def get_schedule_data(
except IndexError:
pass
except AttributeError:
"""Makes it so empty strings don't crash the program"""
pass
pass # Makes it so empty strings don't crash the program
if group_raw:
for gr in group_raw:
group.append(gr.text)
Expand All @@ -273,12 +256,12 @@ def get_schedule_data(
"classroom": classroom,
"group": group,
"event": event,
"hour": int(hour_num),
"hour": hour_name,
"week_day": int(day_num),
"hour_in_block": int(classes_in_hour),
"date": date_formatted,
}
scraped_data[day_num][hour_num][
scraped_data[day_num][hour_name][
classes_in_hour
] = data_out
classes_in_hour += 1
Expand All @@ -290,20 +273,20 @@ def get_schedule_data(
"classroom": classroom,
"group": group,
"event": event,
"hour": int(hour_num),
"hour": hour_name,
"week_day": int(day_num),
"hour_in_block": int(classes_in_hour),
"date": date_formatted,
}
scraped_data[day_num][hour_num][
scraped_data[day_num][hour_name][
classes_in_hour
] = data_out
classes_in_hour += 1
count2 += 1
count += 1
scraped_data["week_data"] = {"hour_times": [], "dates": [], "current_week": "", "class": ""}
scraped_data["week_data"]["hour_times"] = hour_times
scraped_data["week_data"]["dates"] = dates_formatted
scraped_data["week_data"]["current_week"] = current_week
scraped_data["week_data"]["class"] = current_class

return scraped_data