diff --git a/src/eAsisitent_scraper/scraper.py b/src/eAsisitent_scraper/scraper.py index fe72aa5..bb7e795 100644 --- a/src/eAsisitent_scraper/scraper.py +++ b/src/eAsisitent_scraper/scraper.py @@ -21,12 +21,12 @@ def request_schedule( :param school_id: The ID of the school you want to get data for :type school_id: str - :param class_id: The ID of the class you want to get data for, defaults to 0 (optional), defaults to 0 (optional) - :param professor: The ID of the professor you want to get data for, defaults to 0 (optional), defaults to 0 (optional) - :param classroom: The classroom you want to get data for, defaults to 0 (optional), defaults to 0 (optional) - :param interest_activity: The activity you want to get data for, defaults to 0 (optional) - :param school_week: 0 is the current week, 1 is the next week, 2 is the week after that, etc, defaults to 0 (optional) - :param student_id: The ID of the student you want to get the schedule for, defaults to 0 (optional) + :param class_id: The ID of the class you want to get data for, 0 is all classes, defaults to 0 (optional) + :param professor: The ID of the professor you want to get data for, 0 is all professors, defaults to 0 (optional) + :param classroom: The classroom you want to get data for, 0 is all classrooms, defaults to 0 (optional) + :param interest_activity: The activity you want to get data for, 0 is all interest activities, defaults to 0 (optional) + :param school_week: school week that you want to get the data for, 0 is the current week, defaults to 0 (optional) + :param student_id: The ID of the student you want to get the schedule for, 0 is all students, defaults to 0 (optional) :param soup: Return a BeautifulSoup object (optional) :return: A response object is a requests.models.Response object. 
@@ -47,20 +47,6 @@ def request_schedule( today = datetime.date.today() -def hour_to_num(hour: str): - """ - Convert hour name to integer - - :param hour: the hour that you want to be converted to int - :type hour: str - :return: The hour as an integer. - """ - if hour.lower() == "predura": - return int(0) - else: - return int(hour.split(". ura")[0]) - - def get_schedule_data( school_id: str, class_id=0, @@ -76,18 +62,17 @@ def get_schedule_data( :param school_id: The ID of the school you want to get data for :type school_id: str - :param class_id: The ID of the class you want to get data for, defaults to 0 (optional), defaults to 0 (optional) - :param professor: The ID of the professor you want to get data for, defaults to 0 (optional), defaults to 0 (optional) - :param classroom: The classroom you want to get data for, defaults to 0 (optional), defaults to 0 (optional) - :param interest_activity: The activity you want to get data for, defaults to 0 (optional) - :param school_week: 0 is the current week, 1 is the next week, 2 is the week after that, etc, defaults to 0 (optional) - :param student_id: The ID of the student you want to get the schedule for, defaults to 0 (optional) + :param class_id: The ID of the class you want to get data for, 0 is all classes, defaults to 0 (optional) + :param professor: The ID of the professor you want to get data for, 0 is all professors, defaults to 0 (optional) + :param classroom: The classroom you want to get data for, 0 is all classrooms, defaults to 0 (optional) + :param interest_activity: The activity you want to get data for, 0 is all interest activities, defaults to 0 (optional) + :param school_week: school week that you want to get the data for, 0 is the current week, defaults to 0 (optional) + :param student_id: The ID of the student you want to get the schedule for, 0 is all students, defaults to 0 (optional) :return: A dictionary with the data. 
""" # TODO: reduce complexity of the function, # better naming of variables, - # get template for scraped_data from template.json response = request_schedule(school_id=school_id, class_id=class_id, professor=professor, @@ -96,7 +81,7 @@ def get_schedule_data( school_week=school_week, student_id=student_id) soup = BeautifulSoup(response.text, "html5lib") - seznam_ur_teden = soup.select("body > table > tbody > tr") + table_rows = soup.select("body > table > tbody > tr") count: int = -1 @@ -104,56 +89,56 @@ def get_schedule_data( dates_formatted: list = [] hour_times: list = [] - scraped_data: dict = {str(i): {str(j): {} for j in range(15)} for i in range(7)} - scraped_data["week_data"] = {"hour_times": [], "dates": [], "current_week": "", "class": ""} + scraped_data: dict = {str(i): {} for i in range(7)} current_week = int("".join(re.findall("[0-9]", [item.text.split(",")[0] for item in soup.select("body > div > span")][0]))) current_class = str([item.text.strip() for item in soup.select("body > div > strong")][0]) - for i in seznam_ur_teden: + for table_row in table_rows: if count == -1: - for days in i: + for days in table_row: if type(days) == bs4.element.Tag: day = days.select("div") if day[0].text != "Ura": temp_date = re.findall(r"[^A-z,. 
]+", day[1].text) temp_datetime = datetime.datetime( - day=int(temp_date[0]), - month=int(temp_date[1]), - year=today.year, - ) + day=int(temp_date[0]), + month=int(temp_date[1]), + year=today.year, + ) dates_formatted.append(str(temp_datetime.strftime("%Y-%m-%d"))) dates.append(temp_datetime) if count >= 0: - row = i.find_all("td", class_="ednevnik-seznam_ur_teden-td") - hour_name = row[0].find(class_="text14").text + row = table_row.find_all("td", class_="ednevnik-seznam_ur_teden-td") + hour_name = str(row[0].find(class_="text14").text) hour_time = row[0].find(class_="text10").text hour_times.append(hour_time) - hour_num = str(hour_to_num(hour_name)) - hour_num = str(hour_num) + count2: int = 0 - for block in row: + for row_part in row: if count2 != 0: """Pass the first collum that contains hour times""" date = dates[count2 - 1] day_num = str(date.weekday()) date_formatted = str(date.strftime("%Y-%m-%d")) - if "style" not in block.attrs: + scraped_data[day_num].update({str(hour_name): {}}) + + if "style" not in row_part.attrs: data_out = { "subject": None, "teacher": None, "classroom": None, "group": None, "event": None, - "hour": int(hour_num), + "hour": hour_name, "week_day": int(day_num), "hour_in_block": 0, "date": date_formatted, } - scraped_data[day_num][hour_num]["0"] = data_out + scraped_data[day_num][hour_name]["0"] = data_out else: classes_in_hour = 0 - for section in block: + for section in row_part: if type(section) == bs4.element.Tag: event = None subject = None @@ -199,10 +184,9 @@ def get_schedule_data( teacher = teacher_classroom[0] classroom = teacher_classroom[1] except IndexError: - pass + pass # Makes it so empty strings don't crash the program except AttributeError: - """Makes it so empty strings don't crash the program""" - pass + pass # Makes it so empty strings don't crash the program if group_raw: for gr in group_raw: group.append(gr.text) @@ -212,9 +196,9 @@ def get_schedule_data( section.attrs["id"], ) ): - """Check for blocks""" - for 
block_part in section: - if type(block_part) == bs4.element.Tag: + # Check for blocks + for block in section: + if type(block) == bs4.element.Tag: event = None subject = None group_raw = None @@ -222,7 +206,7 @@ def get_schedule_data( teacher = None classroom = None teacher_classroom = None - for img in block_part.select("img"): + for img in block.select("img"): events_list = { "Odpadla ura": "cancelled", "Dogodek": "event", @@ -243,15 +227,15 @@ def get_schedule_data( event = "unknown_event" try: subject = ( - block_part.find(class_="text14") + block.find(class_="text14") .text.replace("\n", "") .replace("\t", "") ) - group_raw = block_part.find_all( + group_raw = block.find_all( class_="text11 gray bold" ) teacher_classroom = ( - block_part.find(class_="text11") + block.find(class_="text11") .text.replace("\n", "") .replace("\t", "") .replace("\r", "") @@ -262,8 +246,7 @@ def get_schedule_data( except IndexError: pass except AttributeError: - """Makes it so empty strings don't crash the program""" - pass + pass # Makes it so empty strings don't crash the program if group_raw: for gr in group_raw: group.append(gr.text) @@ -273,12 +256,12 @@ def get_schedule_data( "classroom": classroom, "group": group, "event": event, - "hour": int(hour_num), + "hour": hour_name, "week_day": int(day_num), "hour_in_block": int(classes_in_hour), "date": date_formatted, } - scraped_data[day_num][hour_num][ + scraped_data[day_num][hour_name][ classes_in_hour ] = data_out classes_in_hour += 1 @@ -290,20 +273,20 @@ def get_schedule_data( "classroom": classroom, "group": group, "event": event, - "hour": int(hour_num), + "hour": hour_name, "week_day": int(day_num), "hour_in_block": int(classes_in_hour), "date": date_formatted, } - scraped_data[day_num][hour_num][ + scraped_data[day_num][hour_name][ classes_in_hour ] = data_out classes_in_hour += 1 count2 += 1 count += 1 + scraped_data["week_data"] = {"hour_times": [], "dates": [], "current_week": "", "class": ""} 
scraped_data["week_data"]["hour_times"] = hour_times scraped_data["week_data"]["dates"] = dates_formatted scraped_data["week_data"]["current_week"] = current_week scraped_data["week_data"]["class"] = current_class - return scraped_data