diff --git a/.gitignore b/.gitignore index b7c90b8..3d5eaac 100644 --- a/.gitignore +++ b/.gitignore @@ -13,9 +13,9 @@ Pipfile.lock # Other random_code/ dist/ -src/eAsistent_scraper.egg-info/ +*.egg-info git_hidden/ # Not done yet tests/ -src/eAsisitent_scraper/pharser.py +src/eAsisitentAPI/pharser.py diff --git a/README.md b/README.md index 217998c..3a3ec6e 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,6 @@ # eAsistent scraper (WIP) -*** -Scrapes data from *easistent.com/urniki/...* and returns it as Python dictionary -*** +## EN To install it run: pip install eAsistent-scraper @@ -11,11 +9,13 @@ To install it run: Example usage: ```python -import eAsisitent_scraper +import eAsisitentAPI -data = eAsisitent_scraper.get_schedule_data(school_id="SCHOOL_ID", class_id="CLASS_ID") +data = eAsisitentAPI.get_schedule(school_id="SCHOOL_ID", class_id=CLASS_ID) ``` *** For support and feature requests ask on [GitHub discussions](https://github.com/PingWasFun/eAsistent-scraper/discussions/categories/general) -To contribute fork the GitHub repository and make a pull request. +## SLO + +// TODO \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 8082450..feac667 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,15 +1,15 @@ [metadata] -name = eAsistent_scraper -version = 1.5.1 +name = eAsistentAPI +version = 2.0.0 author = PingIsFun author_email = pingisfun@protonmail.com description = Scrapes data from easistent.com/urniki/... and returns it as Python dictionary long_description = file: README.md long_description_content_type = text/markdown -url = https://github.com/PingWasFun/eAsistent-scraper +url = https://github.com/PingIsFun/eAsistentAPI project_urls = - Bug Tracker = https://github.com/PingWasFun/eAsistent-scraper/issues - Help = https://github.com/PingWasFun/eAsistent-scraper/discussions/categories/general + Bug Tracker = https://github.com/PingIsFun/eAsistentAPI/issues + Help = https://github.com/PingIsFun/eAsistentAPI/discussions/categories/general license = MIT License platform = any classifiers = @@ -33,7 +33,7 @@ install_requires = package_dir = = src packages = find: -python_requires = >=3.7 +python_requires = >=3.10 [options.packages.find] where = src diff --git a/src/eAsisitentAPI/__init__.py b/src/eAsisitentAPI/__init__.py new file mode 100644 index 0000000..ca87a63 --- /dev/null +++ b/src/eAsisitentAPI/__init__.py @@ -0,0 +1,2 @@ +from .scraper import get_schedule + diff --git a/src/eAsisitentAPI/scraper.py b/src/eAsisitentAPI/scraper.py new file mode 100644 index 0000000..abeb904 --- /dev/null +++ b/src/eAsisitentAPI/scraper.py @@ -0,0 +1,250 @@ +import bs4.element +import datetime +import re +import requests +import time +from dataclasses import dataclass + +from bs4 import BeautifulSoup +from requests import Response + + +@dataclass() +class Formatting: + SUBJECT_CLASS = "text14" + RAW_GROUP_CLASS = "text11 gray bold" + TEACHER_CLASSROOM_CLASS = "text11" + EVENT_CLASS = "text14" + CLASS_NAME_CLASS = "text20" + + +@dataclass() +class HourBlock: + subject: str + teacher: str + classroom: str + group: list[str] + event: str + hour: str + hour_in_block: int + date: datetime.date + + +@dataclass() +class Hour: + name: str + hour_blocks: list[HourBlock] + + +@dataclass() +class SchoolDay: + date: datetime + hours: list[Hour] + + +@dataclass() +class UsedData: + school_id: str + class_id: int + professor: int + classroom: int + interest_activity: int + school_week: int + student_id: int + + +@dataclass() +class Schedule: + days: list[SchoolDay] + hour_times: list[str] + dates: list[datetime.date] + class_name: str + request_week: int + request_epoch: int + used_data: UsedData + + +def __get_hour_data(section: bs4.element.Tag) -> tuple[str, list, str, str]: + subject = section.find(class_=Formatting.SUBJECT_CLASS).text.replace("\n", "").replace("\t", "") + group_raw = section.find_all(class_=Formatting.RAW_GROUP_CLASS) + try: + teacher_classroom = list( + section.find(class_=Formatting.TEACHER_CLASSROOM_CLASS) + .text.replace("\n", "") + .replace("\t", "") + .replace("\r", "") + .split(", ") + ) + except AttributeError: + subject = section.find(class_=Formatting.EVENT_CLASS).text.replace("\n", "").replace("\t", "") + teacher_classroom = [None, None] + group = [x.text for x in group_raw] + group = None if group == [] else group + return subject, group, teacher_classroom[0], teacher_classroom[1] + + +def __get_event(section: bs4.element.Tag) -> str: + for img in section.select("img"): + return img.attrs["title"] + + +def __make_data_out( + date: datetime.date, + subject: str = None, + teacher: str = None, + classroom: str = None, + group: list = None, + event: str = None, + hour_name: str = None, + week_day: str = None, + hour_in_block: int = None, +) -> HourBlock: + return HourBlock(subject, teacher, classroom, group, event, hour_name, hour_in_block, date) + + +def __format_date(date: datetime.date) -> str: + return str(date.strftime("%Y-%m-%d")) + + +def __get_dates(table_row: bs4.element.Tag) -> list[datetime.date]: + dates: list = [] + for days in table_row: + if type(days) == bs4.element.Tag: + day = days.select("div") + if day[0].text != "Ura": + temp_date = re.findall(r"[^A-z,. ]+", day[1].text) + temp_datetime = datetime.date( + day=int(temp_date[0]), + month=int(temp_date[1]), + year=datetime.date.today().year, + ) + dates.append(temp_datetime) + return dates + + +def __get_hours_time_data(row: bs4.element.ResultSet) -> tuple[str, str]: + hour_name = str(row[0].find(class_="text14").text) + hour_time = str(row[0].find(class_="text10").text.replace(" ", "")) + return hour_name, hour_time + + +def __request_schedule( + school_id: str, + class_id=0, + professor=0, + classroom=0, + interest_activity=0, + school_week=0, + student_id=0, +) -> Response: + url = f"https://www.easistent.com/urniki/izpis/{school_id}/{class_id}/{professor}/{classroom}/{interest_activity}/{school_week}/{student_id}" + + response = requests.get(url) + + if response.text == "Šola ni veljavna!" or response.text == "Šola ni izbrana!": + raise ValueError("This school does not exist. school_id is invalid") + return response + + +def get_schedule( + school_id: str, + class_id=0, + professor=0, + classroom=0, + interest_activity=0, + school_week=0, + student_id=0, +) -> Schedule: + response = __request_schedule( + school_id=school_id, + class_id=class_id, + professor=professor, + classroom=classroom, + interest_activity=interest_activity, + school_week=school_week, + student_id=student_id, + ) + + request_time = int(time.time()) + + soup = BeautifulSoup(response.text, "html5lib") + table_rows = soup.select("body > table > tbody > tr") + + hour_times: list = [] + dates: list[datetime.date] = [] + + current_week = int( + "".join( + re.findall( + "[0-9]", + [item.text.split(",")[0] for item in + soup.select("body > div > span")][ + 0 + ], + ) + ) + ) + class_name = str( + [item.text.strip() for item in soup.select("body > div > strong")][0] + ) + final_bundle_pre_turn = [] + for count, table_row in enumerate(table_rows): + bundle_hour: list[Hour] = [] + if count == 0: + dates = __get_dates(table_row) + continue + + row = table_row.find_all("td", + class_="ednevnik-seznam_ur_teden-td") + hour_name, hour_time = __get_hours_time_data(row) + hour_times.append(hour_time) + for count2, row_part in enumerate(row): + if count2 == 0: + continue + bundle_hour_block = Hour(hour_name, []) + """Pass the first column that contains hour times""" + date = dates[count2 - 1] + day_num = str(date.weekday()) + if "style" not in row_part.attrs: # Detect empty hours + data_out = __make_data_out(date, hour_name=hour_name, week_day=day_num, hour_in_block=0) + bundle_hour_block.hour_blocks.append(data_out) + else: + classes_in_hour = 0 + for section in row_part: + if type(section) != bs4.element.Tag: + continue + event = __get_event(section) + subject, group, teacher, hour_classroom = __get_hour_data(section) + + is_block_hour = ("id" in section.attrs) and bool( + re.match( + r"ednevnik-seznam_ur_teden-blok" + r"-\d\d\d\d\d\d-\d\d\d\d-\d\d-\d\d", + section.attrs["id"], + ) + ) + + if not is_block_hour: + data_out = __make_data_out( + date, subject, teacher, hour_classroom, group, event, hour_name, day_num, classes_in_hour + ) + bundle_hour_block.hour_blocks.append(data_out) + classes_in_hour += 1 + continue + + for block in section: + if type(block) != bs4.element.Tag: + continue + event = __get_event(section) + subject, group, teacher, hour_classroom = __get_hour_data(section) + data_out = __make_data_out( + date, subject, teacher, hour_classroom, group, event, hour_name, day_num, classes_in_hour + ) + bundle_hour_block.hour_blocks.append(data_out) + classes_in_hour += 1 + + bundle_hour.append(bundle_hour_block) + final_bundle_pre_turn.append(bundle_hour) + school_days_list = [SchoolDay(dates[index], list(x)) for index, x in enumerate(list(zip(*final_bundle_pre_turn)))] + used_data = UsedData(school_id, class_id, professor, classroom, interest_activity, school_week, student_id) + return Schedule(school_days_list, hour_times, dates, class_name, current_week, request_time, used_data) diff --git a/src/eAsisitent_scraper/__init__.py b/src/eAsisitent_scraper/__init__.py deleted file mode 100644 index d14f304..0000000 --- a/src/eAsisitent_scraper/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .scraper import get_schedule_data - diff --git a/src/eAsisitent_scraper/scraper.py b/src/eAsisitent_scraper/scraper.py deleted file mode 100644 index 6ccd244..0000000 --- a/src/eAsisitent_scraper/scraper.py +++ /dev/null @@ -1,333 +0,0 @@ -import bs4.element -import datetime -import re -import requests -import time - -from bs4 import BeautifulSoup - - -def request_schedule( - school_id: str, - class_id=0, - professor=0, - classroom=0, - interest_activity=0, - school_week=0, - student_id=0, - soup=False, -): - """ - It requests schedule from easistent.com and returns it as a response - - :param school_id: The ID of the school you want to get data for - :type school_id: str - :param class_id: The ID of the class you want to get data for, 0 is all classes, defaults to 0 (optional) - :param professor: The ID of the professor you want to get data for, 0 is all professors, defaults to 0 (optional) - :param classroom: The classroom you want to get data for, 0 is all classrooms, defaults to 0 (optional) - :param interest_activity: The activity you want to get data for, 0 is all interest activities, defaults to 0 (optional) - :param school_week: school week that you want to get the data for, 0 is the current week, defaults to 0 (optional) - :param student_id: The ID of the student you want to get the schedule for,0 is all students, defaults to 0 (optional) - :param soup: Return a BeautifulSoup object (optional) - :return: A response object is a requests.models.Response object. - - - """ - - url = f"https://www.easistent.com/urniki/izpis/{school_id}/{class_id}/{professor}/{classroom}/{interest_activity}/{school_week}/{student_id}" - - response = requests.get(url) - - if response.text == "Šola ni veljavna!" or response.text == "Šola ni izbrana!": - raise ValueError("This school does not exist. school_id is invalid") - if soup: - return BeautifulSoup(response.text, "html5lib") - return response - - -today = datetime.date.today() - - -def get_schedule_data( - school_id: str, - class_id=0, - professor=0, - classroom=0, - interest_activity=0, - school_week=0, - student_id=0, -): - """ - Date format is: YYYY-MM-DD - If school id is invalid ValueError is raised - - :param school_id: The ID of the school you want to get data for - :type school_id: str - :param class_id: The ID of the class you want to get data for, 0 is all classes, defaults to 0 (optional) - :param professor: The ID of the professor you want to get data for, 0 is all professors, defaults to 0 (optional) - :param classroom: The classroom you want to get data for, 0 is all classrooms, defaults to 0 (optional) - :param interest_activity: The activity you want to get data for, 0 is all interest activities, defaults to 0 (optional) - :param school_week: school week that you want to get the data for, 0 is the current week, defaults to 0 (optional) - :param student_id: The ID of the student you want to get the schedule for,0 is all students, defaults to 0 (optional) - :return: A dictionary with the data. - """ - - # TODO: reduce complexity of the function, - # better naming of variables, - response = request_schedule( - school_id=school_id, - class_id=class_id, - professor=professor, - classroom=classroom, - interest_activity=interest_activity, - school_week=school_week, - student_id=student_id, - ) - - request_time = int(time.time()) - - soup = BeautifulSoup(response.text, "html5lib") - table_rows = soup.select("body > table > tbody > tr") - - count: int = -1 - - dates: list = [] - dates_formatted: list = [] - hour_times: list = [] - - scraped_data: dict = {} - - current_week = int( - "".join( - re.findall( - "[0-9]", - [item.text.split(",")[0] for item in - soup.select("body > div > span")][ - 0 - ], - ) - ) - ) - current_class = str( - [item.text.strip() for item in soup.select("body > div > strong")][0] - ) - - for table_row in table_rows: - if count == -1: - for days in table_row: - if type(days) == bs4.element.Tag: - day = days.select("div") - if day[0].text != "Ura": - temp_date = re.findall(r"[^A-z,. ]+", day[1].text) - temp_datetime = datetime.datetime( - day=int(temp_date[0]), - month=int(temp_date[1]), - year=today.year, - ) - dates_formatted.append( - str(temp_datetime.strftime("%Y-%m-%d"))) - dates.append(temp_datetime) - if count >= 0: - row = table_row.find_all("td", - class_="ednevnik-seznam_ur_teden-td") - hour_name = str(row[0].find(class_="text14").text) - hour_time = row[0].find(class_="text10").text.replace(" ", "") - hour_times.append(hour_time) - - count2: int = 0 - for row_part in row: - if count2 != 0: - """Pass the first collum that contains hour times""" - date = dates[count2 - 1] - day_num = str(date.weekday()) - date_formatted = str(date.strftime("%Y-%m-%d")) - if day_num not in scraped_data.keys(): - scraped_data.update({str(day_num): {}}) - scraped_data[day_num].update({str(hour_name): {}}) - - if "style" not in row_part.attrs: - data_out = { - "subject": None, - "teacher": None, - "classroom": None, - "group": None, - "event": None, - "hour": hour_name, - "week_day": int(day_num), - "hour_in_block": 0, - "date": date_formatted, - } - scraped_data[day_num][hour_name]["0"] = data_out - else: - classes_in_hour = 0 - for section in row_part: - if type(section) == bs4.element.Tag: - event = None - subject = None - group_raw = None - group = [] - teacher = None - classroom = None - teacher_classroom = None - for img in section.select("img"): - events_list = { - "Odpadla ura": "cancelled", - "Dogodek": "event", - "Nadomeščanje": "substitute", - "Polovična ura": "half_hour", - "Videokonferenca": "video_call", - "Interesna dejavnost": "activity", - "Zaposlitev": "occupation", - "Neopravljena ura": "unfinished_hour", - "Govorilne ure": "office_hours", - "Izpiti": "exams", - } - try: - event = events_list[img.attrs["title"]] - except KeyError: - event = "unknown_event" - - try: - subject = ( - section.find(class_="text14") - .text.replace("\n", "") - .replace("\t", "") - ) - group_raw = section.find_all( - class_="text11 gray bold" - ) - teacher_classroom = ( - section.find(class_="text11") - .text.replace("\n", "") - .replace("\t", "") - .replace("\r", "") - .split(", ") - ) - teacher = teacher_classroom[0] - classroom = teacher_classroom[1] - except IndexError: - pass # Makes it so empty strings don't - # crash the program - except AttributeError: - pass # Makes it so empty strings don't - # crash the program - if group_raw: - for gr in group_raw: - group.append(gr.text) - if ("id" in section.attrs) and bool( - re.match( - r"ednevnik-seznam_ur_teden-blok" - r"-\d\d\d\d\d\d-\d\d\d\d-\d\d-\d\d", - section.attrs["id"], - ) - ): - # Check for blocks - for block in section: - if type(block) == bs4.element.Tag: - event = None - subject = None - group_raw = None - group = [] - teacher = None - classroom = None - teacher_classroom = None - for img in block.select("img"): - events_list = { - "Odpadla ura": "cancelled", - "Dogodek": "event", - "Nadomeščanje": "substitute", - "Polovična ura": "half_hour", - "Videokonferenca": "video_call", - "Interesna dejavnost": "activity", - "Zaposlitev": "occupation", - "Neopravljena ura": "unfinished_hour", - "Govorilne ure": "office hours", - "Izpiti": "exams", - } - try: - event = events_list[ - img.attrs["title"] - ] - except KeyError: - event = "unknown_event" - try: - subject = ( - block.find(class_="text14") - .text.replace("\n", "") - .replace("\t", "") - ) - group_raw = block.find_all( - class_="text11 gray bold" - ) - teacher_classroom = ( - block.find(class_="text11") - .text.replace("\n", "") - .replace("\t", "") - .replace("\r", "") - .split(", ") - ) - teacher = teacher_classroom[0] - classroom = teacher_classroom[ - 1] - except IndexError: - pass - except AttributeError: - pass # Makes it so empty - # strings don't crash the - # program - if group_raw: - for gr in group_raw: - group.append(gr.text) - data_out = { - "subject": subject, - "teacher": teacher, - "classroom": classroom, - "group": group, - "event": event, - "hour": hour_name, - "week_day": int(day_num), - "hour_in_block": int( - classes_in_hour), - "date": date_formatted, - } - scraped_data[day_num][hour_name][ - str(classes_in_hour) - ] = data_out - classes_in_hour += 1 - - else: - data_out = { - "subject": subject, - "teacher": teacher, - "classroom": classroom, - "group": group, - "event": event, - "hour": hour_name, - "week_day": int(day_num), - "hour_in_block": int(classes_in_hour), - "date": date_formatted, - } - scraped_data[day_num][hour_name][ - str(classes_in_hour) - ] = data_out - classes_in_hour += 1 - count2 += 1 - count += 1 - scraped_data["request_data"] = {} - scraped_data["request_data"]["hour_times"] = hour_times - scraped_data["request_data"]["dates"] = dates_formatted - scraped_data["request_data"]["class"] = current_class - scraped_data["request_data"]["request_week"] = current_week - scraped_data["request_data"]["request_epoch"] = request_time - scraped_data["request_data"]["used_data"] = \ - { - "school_id": school_id, - "class_id": class_id, - "professor": professor, - "classroom": classroom, - "interest_activity": interest_activity, - "school_week": school_week, - "student_id": student_id - } - - return scraped_data