diff --git a/PoliNetwork.Graduatorie.Common/Enums/SchoolEnum.cs b/PoliNetwork.Graduatorie.Common/Enums/SchoolEnum.cs index a73518b8..66b6eced 100644 --- a/PoliNetwork.Graduatorie.Common/Enums/SchoolEnum.cs +++ b/PoliNetwork.Graduatorie.Common/Enums/SchoolEnum.cs @@ -16,4 +16,19 @@ public enum SchoolEnum Architettura = 3, Design = 4, Unknown = 0 +} + +public static class SchoolEnumMethods +{ + public static string ToShortName(this SchoolEnum s) + { + return s switch + { + SchoolEnum.Architettura => "ARC", + SchoolEnum.Design => "DES", + SchoolEnum.Ingegneria => "ENG", + SchoolEnum.Urbanistica => "URB", + _ => "UNK", + }; + } } \ No newline at end of file diff --git a/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/IndexJsonBase.cs b/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/IndexJsonBase.cs index d01be1da..599b87ca 100644 --- a/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/IndexJsonBase.cs +++ b/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/IndexJsonBase.cs @@ -57,7 +57,7 @@ public static void WriteSingleJsons(RankingsSet? 
set, string outFolder, ArgsConf private static void WriteSingleJsonRanking(string folder, Ranking ranking, ArgsConfig argsConfig) { - var path = Path.Join(folder, ranking.ConvertPhaseToFilename()); + var path = Path.Join(folder, ranking.GetFilename()); if (ExitIfAlreadyExistsAndNotUpdated(ranking, path) && !argsConfig.ForceReparsing) return; diff --git a/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/BySchoolYearCourseJson.cs b/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/BySchoolYearCourseJson.cs index 0f6a5b53..900968e6 100644 --- a/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/BySchoolYearCourseJson.cs +++ b/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/BySchoolYearCourseJson.cs @@ -132,7 +132,7 @@ bool IsThisCourse(SingleCourseJson x) private static int Comparison(SingleCourseJson x, SingleCourseJson y) { - return x.Compare(y); + return x.CompareTo(y); } private static SingleCourseJson CreateCourseJson(Ranking ranking, CourseTable course) @@ -140,8 +140,8 @@ private static SingleCourseJson CreateCourseJson(Ranking ranking, CourseTable co var basePath = ranking.School + "/" + ranking.Year + "/"; return new SingleCourseJson { - Link = ranking.ConvertPhaseToFilename(), - Name = ranking.RankingOrder?.Phase, + Link = ranking.GetFilename(), + Id = ranking.GetId(), BasePath = basePath, Year = ranking.Year, School = ranking.School, @@ -159,7 +159,7 @@ bool Predicate(Ranking v1) { return singleCourseJson.School == v1.School && singleCourseJson.Year == v1.Year - && v1.RankingOrder?.Phase == singleCourseJson.Name; + && singleCourseJson.RankingOrder?.GetId() == v1.RankingOrder?.GetId(); } } diff --git a/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/BySchoolYearJson.cs b/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/BySchoolYearJson.cs index a0749948..d93cd3fa 100644 --- a/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/BySchoolYearJson.cs +++ 
b/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/BySchoolYearJson.cs @@ -62,11 +62,10 @@ IDictionary> schoolDict .DistinctBy(x => x.Link) .ToList(); var filenames = singleCourseJsons - .OrderBy(a => a.Name) + .OrderBy(a => a.Id) .ThenBy(a => a.Year) .ThenBy(a => a.School) .ThenBy(a => a.BasePath) - .ThenBy(a => a.Link) .ToList(); schoolDict.Add(yearGroupKey.Value, filenames); diff --git a/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/ByYearSchoolJson.cs b/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/ByYearSchoolJson.cs index c5bfb610..7206836c 100644 --- a/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/ByYearSchoolJson.cs +++ b/PoliNetwork.Graduatorie.Parser/Objects/Json/Indexes/Specific/ByYearSchoolJson.cs @@ -42,7 +42,7 @@ public class ByYearSchoolJson : IndexJsonBase var filenames = schoolGroup .SelectMany(ranking => ranking.ToSingleCourseJson()) .DistinctBy(x => x.Link) - .ToList().OrderBy(a => a.Name); + .ToList().OrderBy(a => a.Id); yearDict.Add(schoolGroup.Key.Value, filenames); } diff --git a/PoliNetwork.Graduatorie.Parser/Objects/Json/SingleCourseJson.cs b/PoliNetwork.Graduatorie.Parser/Objects/Json/SingleCourseJson.cs index dc487970..bdb25dc1 100644 --- a/PoliNetwork.Graduatorie.Parser/Objects/Json/SingleCourseJson.cs +++ b/PoliNetwork.Graduatorie.Parser/Objects/Json/SingleCourseJson.cs @@ -12,12 +12,12 @@ namespace PoliNetwork.Graduatorie.Parser.Objects.Json; [Serializable] [JsonObject(MemberSerialization.Fields, NamingStrategyType = typeof(CamelCaseNamingStrategy))] -public class SingleCourseJson +public class SingleCourseJson: IComparable { public string? BasePath; public string? Link; public string? Location; - public string? Name; + public string? Id; public RankingOrder? RankingOrder; public SchoolEnum? School; public int? Year; @@ -25,7 +25,7 @@ public class SingleCourseJson public int GetHashWithoutLastUpdate() { var hashWithoutLastUpdate = Link?.GetHashCode() ?? 
"Link".GetHashCode(); - var hashCode = Name?.GetHashCode() ?? "Name".GetHashCode(); + var hashCode = Id?.GetHashCode() ?? "Id".GetHashCode(); var basePathInt = BasePath?.GetHashCode() ?? "BasePath".GetHashCode(); var yearInt = Year?.GetHashCode() ?? "Year".GetHashCode(); var schoolInt = School?.GetHashCode() ?? "School".GetHashCode(); @@ -33,8 +33,10 @@ public int GetHashWithoutLastUpdate() return hashWithoutLastUpdate ^ hashCode ^ basePathInt ^ yearInt ^ schoolInt ^ code; } - public int Compare(SingleCourseJson singleCourseJson) + public int CompareTo(SingleCourseJson? singleCourseJson) { + if (singleCourseJson == null) return 1; + if (Year != singleCourseJson.Year) return (Year ?? -1) < (singleCourseJson.Year ?? -1) ? -1 : 1; @@ -49,14 +51,14 @@ public int Compare(SingleCourseJson singleCourseJson) if (Location != singleCourseJson.Location) return string.Compare(Location ?? "", singleCourseJson.Location ?? "", StringComparison.InvariantCulture); - if (Name != singleCourseJson.Name) - return string.Compare(Name ?? "", singleCourseJson.Name ?? "", StringComparison.InvariantCulture); + if (Id != singleCourseJson.Id) + return string.Compare(Id ?? "", singleCourseJson.Id ?? "", StringComparison.InvariantCulture); return 0; } public bool Is(CourseTable courseTable) { - return Name == courseTable.Title; + return (RankingOrder?.Phase ?? 
"") == courseTable.Title; } } \ No newline at end of file diff --git a/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/Ranking.cs b/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/Ranking.cs index 7778b72f..eb23b7dd 100644 --- a/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/Ranking.cs +++ b/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/Ranking.cs @@ -96,11 +96,29 @@ private void MergeRankingOrder(Ranking ranking) RankingOrder.Merge(ranking.RankingOrder); } - public string ConvertPhaseToFilename() + public string GetFilename() { - var s = DateTime.UtcNow.ToString("yyyyMMddTHHmmss", CultureInfo.InvariantCulture) + "Z"; - var phase1 = RankingOrder?.Phase ?? s; - return $"{phase1}.json".Replace(" ", "_"); + var id = GetId(); + return $"{id}.json"; + } + + public string GetId() + { + var idList = new List(); + + var schoolShort = School?.ToShortName(); + if (schoolShort != null) idList.Add(schoolShort); + + var yearStr = Year.ToString(); + if (yearStr != null) idList.Add(yearStr); + + var orderId = RankingOrder?.GetId(); + if (orderId != null) idList.Add(orderId); + + var fallback = DateTime.UtcNow.ToString("yyyyMMddTHHmmss", CultureInfo.InvariantCulture) + "Z"; + if (idList.Count == 0) idList.Add(fallback); + + return string.Join("_", idList); } public List ToSingleCourseJson() @@ -111,8 +129,8 @@ public List ToSingleCourseJson() if (courseTables == null) return result; result.AddRange(courseTables.Select(variable => new SingleCourseJson { - Link = ConvertPhaseToFilename(), - Name = RankingOrder?.Phase, + Link = GetFilename(), + Id = GetId(), BasePath = schoolString + "/" + Year + "/", Year = Year, School = School, diff --git a/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/RankingOrder.cs b/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/RankingOrder.cs index 5a24d668..140b4429 100644 --- a/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/RankingOrder.cs +++ b/PoliNetwork.Graduatorie.Parser/Objects/RankingNS/RankingOrder.cs @@ -21,14 +21,16 @@ public class 
RankingOrder //prima graduatoria di seconda fase:{primary:2, secondary:1} public int? Primary; public int? Secondary; + public bool IsEnglish = false; public RankingOrder() { } - public RankingOrder(string phase) + public RankingOrder(string phase, bool isEnglish = false) { Phase = phase; + IsEnglish = isEnglish; FixValues(); } @@ -62,6 +64,10 @@ private void FixValues() "TERZA" => 3, "QUARTA" => 4, "QUINTA" => 5, + "SESTA" => 6, + "SETTIMA" => 7, + "OTTAVA" => 8, + // veramente ci sarà una nona graduatoria? _ => null }; } @@ -69,6 +75,29 @@ private void FixValues() return null; } + public string GetId() + { + var idList = new List(); + if (Anticipata == true) idList.Add($"anticipata"); + if (Primary != null) idList.Add($"{Primary}fase"); + if (Secondary != null) idList.Add($"{Secondary}grad"); + + var cleanPhase = Phase?.Replace("_", "").Replace("-", "").Replace(" ", "_").ToLower() ?? ""; + var noOrder = Anticipata == false && Primary == null && Secondary == null; + var isSingleExtraEu = noOrder && cleanPhase.Contains("extraue"); + + if (noOrder) + { + idList.Add(isSingleExtraEu ? "extraeu" : cleanPhase); + } + + idList.Add(IsEnglish ? "eng" : "ita"); + if (ExtraEu == true && !isSingleExtraEu) idList.Add("extraeu"); // the second condition is to avoid double extraeu + + var id = string.Join("_", idList); + return id; + } + public int GetHashWithoutLastUpdate() { var i = "RankingOrder".GetHashCode(); @@ -88,4 +117,4 @@ public void Merge(RankingOrder? 
rankingRankingOrder) Secondary ??= rankingRankingOrder?.Secondary; ExtraEu ??= rankingRankingOrder?.ExtraEu; } -} \ No newline at end of file +} diff --git a/PoliNetwork.Graduatorie.Parser/Utils/Transformer/ParserNS/Parser.cs b/PoliNetwork.Graduatorie.Parser/Utils/Transformer/ParserNS/Parser.cs index b0696910..d1d1c9ff 100644 --- a/PoliNetwork.Graduatorie.Parser/Utils/Transformer/ParserNS/Parser.cs +++ b/PoliNetwork.Graduatorie.Parser/Utils/Transformer/ParserNS/Parser.cs @@ -205,8 +205,29 @@ private RankingsSet ParseNewRankings(IReadOnlyCollection htmls) ranking.School = school; ranking.Year = Convert.ToInt16(intestazioni[1].Split("Year ")[1].Split("/")[0]); - var phase = string.Join(" ", intestazioni[3].Split(" - ")[1..]); - ranking.RankingOrder = new RankingOrder(phase); + if (ranking.Year < 2024) { + // layout valid until 2023 + var phase = string.Join(" ", intestazioni[3].Split(" - ")[1..]); + ranking.RankingOrder = new RankingOrder(phase); + if (ranking.School == SchoolEnum.Architettura && ranking.RankingOrder.Primary == null && + ranking.RankingOrder.Secondary == null && ranking.RankingOrder.ExtraEu == true) + { + // this is a fallback for 2020-2023: + // POLIMI was used to add the ranking number (Secondary, e.g. "Prima Graduatoria") for ExtraEU starting + // from the second ranking. + // e.g. Extra-EU first ranking => phase = "Extra-ue", + // Extra-EU second ranking => phase = "Extra-ue - Seconda Graduatoria" + // so this is a fallback to add the equivalent of "Prima Graduatoria" to the first ExtraEU ranking. 
+ + ranking.RankingOrder.Secondary = 1; + } + } else { + // layout valid since 2024 (if the layout changes again, make another else if) + var phase = intestazioni[3]; + var isEnglish = intestazioni[2].Contains("taught in english") || intestazioni[2].Contains("erogati in inglese"); + ranking.RankingOrder = new RankingOrder(phase, isEnglish); + } + ranking.Extra = intestazioni[4]; ranking.LastUpdate = DateTime.UtcNow; ranking.ByCourse = new List(); @@ -750,4 +771,4 @@ private IEnumerable ParseLocalHtmlFiles() ); return obj2; } -} \ No newline at end of file +} diff --git a/PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs b/PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs index 2b688648..2dc91e58 100644 --- a/PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs +++ b/PoliNetwork.Graduatorie.Scraper/Utils/Web/Scraper.cs @@ -11,9 +11,8 @@ namespace PoliNetwork.Graduatorie.Scraper.Utils.Web; public class Scraper { private const string TargetUrl = Constants.RisultatiAmmissionePolimiIt; - private const string HomepageUrl = "https://www.polimi.it"; - private const string FuturiStudentiUrl = "https://www.polimi.it/futuri-studenti"; - private const string InEvidenzaUrl = "https://www.polimi.it/in-evidenza"; + private const string BaseUrl = "https://www.polimi.it"; + private const string AvvisiFuturiStudentiUrl = "https://www.polimi.it/futuri-studenti/avvisi"; private readonly HashSet _alreadyVisited = new(); @@ -28,89 +27,35 @@ public class Scraper public IEnumerable GetRankingsLinks() { - HashSet links = new(); - - var l1 = ScrapeHomepage(); - var l2 = ScrapeFuturiStudenti(); - var l3 = ScrapeInEvidenza(); - - links.AddRange(l1, l2, l3); - return links; + // before there were multiple source to get links. 
+ // atm rankings are published exclusively + // on AvvisiFuturiStudentiUrl and on TG channel + // note: here we are using the web page + return ScrapeAvvisiFuturiStudenti(); } - private IEnumerable ScrapeHomepage() + private IEnumerable ScrapeAvvisiFuturiStudenti() { HashSet links = new(); - var page = _web.Load(HomepageUrl).DocumentNode; + var page = _web.Load(AvvisiFuturiStudentiUrl).DocumentNode; - var slides = page.SelectNodes("//section[@id='copertina']//div[contains(@class, 'sp-slides')]/div"); - if (slides == null) return links; + var newsCards = + page.SelectNodes("//div[contains(@class, 'news')]//div[contains(@class, 'row--card')]//div[contains(@class, 'card__content')]"); + if (newsCards == null) return links; - foreach (var slide in slides) + foreach (var card in newsCards) { - var h1 = slide.Descendants("h1"); - if (h1 == null) continue; - var text = h1.First().InnerText; - if (!IsValidText(text)) continue; - var a = slide.Descendants("a"); - var href = GetHref(a.First()); - links.AddRange(UseHref(href)); - } + var title = card.Descendants("h5").First(); + var titleValid = title != null && IsValidText(title.InnerText); - return links; - } + var body = card.Descendants("p").Where(el => el.ParentNode.HasClass("news-bodytext")).First(); + var bodyValid = body != null && IsValidText(body.InnerText); - private IEnumerable ScrapeFuturiStudenti() - { - HashSet links = new(); - var page = _web.Load(FuturiStudentiUrl).DocumentNode; + var aTag = card.Descendants("a").First(); - var slides = - page.SelectNodes("//section[@id='newsNoThumb' or @id='news']//div[contains(@class, 'sp-slides')]/div"); - if (slides == null) return links; - - foreach (var slide in slides) - { - var h1 = slide.Descendants("h1"); - var h1Valid = h1 != null && IsValidText(h1.First().InnerText); + if (!titleValid && !bodyValid && aTag != null) continue; - var p = slide.Descendants("p"); - var pValid = p != null && IsValidText(p.First().InnerText); - - - if (!h1Valid && !pValid) continue; - 
var aTags = slide.Descendants("a"); - foreach (var a in aTags) - { - var href = GetHref(a); - links.AddRange(UseHref(href)); - } - } - - return links; - } - - private IEnumerable ScrapeInEvidenza() - { - HashSet links = new(); - var page = _web.Load(InEvidenzaUrl).DocumentNode; - - var liTags = page.SelectNodes("//div[@id='content']//li"); - if (liTags == null) return links; - - foreach (var li in liTags) - { - var h3 = li.GetElementsByTagName("h3"); - - var a = h3.First().ChildNodes[0]; - var aValid = a != null && IsValidText(a.InnerText); - - var p = li.Descendants("p"); - var pValid = p != null && IsValidText(p.First().InnerText); - - if (!aValid && !pValid) continue; - - var href = GetHref(a); + var href = GetHref(aTag); links.AddRange(UseHref(href)); } @@ -128,7 +73,7 @@ private IEnumerable UseHref(string? href) } else { - var url = UrlUtils.UrlifyLocalHref(href, HomepageUrl); + var url = UrlUtils.UrlifyLocalHref(href, BaseUrl); links.AddRange(ParseNewsPage(url)); } @@ -142,7 +87,7 @@ private IEnumerable ParseNewsPage(string url) var page = _web.Load(url).DocumentNode; - var aTags = page.SelectNodes("//div[@id='content']//a[@href]"); + var aTags = page.SelectNodes("//div[contains(@class, 'news-text-wrap')]//a[@href]"); if (aTags == null) return links; foreach (var a in aTags) @@ -189,4 +134,4 @@ private bool IsValidText(string text) return null; } } -} \ No newline at end of file +} diff --git a/PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs b/PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs index e54ea691..fcf0d953 100644 --- a/PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs +++ b/PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs @@ -18,7 +18,8 @@ public static class UrlUtils /// The full url public static string UrlifyLocalHref(string href, string? domain) { - return domain != null && !href.Contains(domain) ? domain + href : href; + if (href.StartsWith("https://") || domain == null) return href; + return !href.Contains(domain) ? 
domain + href : href; } public static bool CheckUrl(RankingUrl? url) @@ -39,4 +40,4 @@ public static bool CheckUrl(RankingUrl? url) return false; } } -} \ No newline at end of file +}