diff --git a/PoliNetwork.Graduatorie.Common/Utils/EnrollUtil.cs b/PoliNetwork.Graduatorie.Common/Utils/EnrollUtil.cs index a5446ec6..702efc24 100644 --- a/PoliNetwork.Graduatorie.Common/Utils/EnrollUtil.cs +++ b/PoliNetwork.Graduatorie.Common/Utils/EnrollUtil.cs @@ -23,13 +23,11 @@ public static EnrollType GetEnrollType(string? rowCanEnrollInto, bool rowCanEnro var s = rowCanEnrollInto.Split(sep).ToList(); var type = s.FirstOrDefault(x => tester.Any(t => t == x.ToLower())); - if (type != null) - { - s.Remove(type); - } + + if (type != null) s.Remove(type); var course = string.Join(sep, s); - return new EnrollType { CanEnroll = true, Course = course, Type = type }; - + return new EnrollType { CanEnroll = true, Course = course, Type = type }; + } } \ No newline at end of file diff --git a/PoliNetwork.Graduatorie.Scraper/Utils/Web/CheckUrlUtil.cs b/PoliNetwork.Graduatorie.Scraper/Utils/Web/CheckUrlUtil.cs new file mode 100644 index 00000000..5c9809fb --- /dev/null +++ b/PoliNetwork.Graduatorie.Scraper/Utils/Web/CheckUrlUtil.cs @@ -0,0 +1,53 @@ +#region + +using PoliNetwork.Graduatorie.Common.Enums; +using PoliNetwork.Graduatorie.Common.Objects.RankingNS; + +#endregion + +namespace PoliNetwork.Graduatorie.Scraper.Utils.Web; + +public class CheckUrlUtil +{ + public static void CheckUrl(RankingUrl variable, HashSet final) + { + try + { + var x = UrlUtils.CheckUrl(variable); + if (!x) return; + lock (final) + { + final.Add(variable); + } + } + catch (Exception exception) + { + Console.WriteLine(exception); + } + } + + public static HashSet GetRankingLinks(IEnumerable rankingsLinks) + { + var parallelQuery = rankingsLinks + .AsParallel() + .Select(RankingUrl.From) + .Where(r => r.PageEnum == PageEnum.Index).ToList(); + + return GetRankingLinksHashSet(parallelQuery); + } + + public static HashSet GetRankingLinksHashSet(IEnumerable parallelQuery) + { + var final = new HashSet(); + + var action = parallelQuery.Select((Func)Selector).ToArray(); + Parallel.Invoke(action); + + return final; + + Action Selector(RankingUrl variable) + { + return () => { CheckUrl(variable, final); }; + } + } +} \ No newline at end of file diff --git a/PoliNetwork.Graduatorie.Scraper/Utils/Web/LinksFind.cs b/PoliNetwork.Graduatorie.Scraper/Utils/Web/LinksFind.cs index 947ce73d..77d782ff 100644 --- a/PoliNetwork.Graduatorie.Scraper/Utils/Web/LinksFind.cs +++ b/PoliNetwork.Graduatorie.Scraper/Utils/Web/LinksFind.cs @@ -2,7 +2,6 @@ using PoliNetwork.Core.Utils; using PoliNetwork.Graduatorie.Common.Data; -using PoliNetwork.Graduatorie.Common.Enums; using PoliNetwork.Graduatorie.Common.Extensions; using PoliNetwork.Graduatorie.Common.Objects.RankingNS; @@ -22,27 +21,23 @@ public static IEnumerable GetAll() var rankingsLinks = new HashSet(); rankingsLinks.AddRange(polimiNewsLinks, combinationLinks); - var rankingsUrls = rankingsLinks - .AsParallel() // from 500ms to 86ms - .Select(RankingUrl.From) - .Where(r => r.PageEnum == PageEnum.Index) - .Where(UrlUtils.CheckUrl) - .ToHashSet(); + var rankingsUrls = CheckUrlUtil.GetRankingLinks(rankingsLinks); var len = rankingsUrls.ToArray().Length; Console.WriteLine($"[INFO] LinksFind.GetAll found {len} links"); return rankingsUrls; } + private static IEnumerable GetCombinationLinks() { var r = new HashSet(); var nowYear = DateTime.UtcNow.Year; - for (var i = 2021; i <= nowYear; i++) r.AddRange(GetYearCominationLinks(i)); + for (var i = 2021; i <= nowYear; i++) r.AddRange(GetYearCombinationLinks(i)); return r; } - private static IEnumerable GetYearCominationLinks(int year) + private static IEnumerable GetYearCombinationLinks(int year) { // partial implemented: polimi has recently added 4 hex chars in the first part // of the path (2022_20064_XXXX_html/) which would require 65k combinations for each diff --git a/PoliNetwork.Graduatorie.Scraper/Utils/Web/ScraperOutput.cs b/PoliNetwork.Graduatorie.Scraper/Utils/Web/ScraperOutput.cs index 3a3e768f..cf79fe07 100644 --- a/PoliNetwork.Graduatorie.Scraper/Utils/Web/ScraperOutput.cs +++ b/PoliNetwork.Graduatorie.Scraper/Utils/Web/ScraperOutput.cs @@ -66,7 +66,7 @@ public static void Write(List rankingsUrls, string? dataFolder) private static string GetOutputLinksString(IEnumerable rankingsUrls) { var output = ""; - var urls = rankingsUrls.Where(UrlUtils.CheckUrl).Select(x => x.Url).Order(); + var urls = CheckUrlUtil.GetRankingLinksHashSet(rankingsUrls).Order(); foreach (var link in urls) { output += link; diff --git a/PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs b/PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs index 1e3eb03b..e54ea691 100644 --- a/PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs +++ b/PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs @@ -23,13 +23,15 @@ public static string UrlifyLocalHref(string href, string? domain) public static bool CheckUrl(RankingUrl? url) { - if (string.IsNullOrEmpty(url?.Url)) + var urlUrl = url?.Url; + if (string.IsNullOrEmpty(urlUrl)) return false; using var client = new HttpClient(); try { - var response = client.GetAsync(url.Url).Result; + var async = client.GetAsync(urlUrl); + var response = async.Result; return response.StatusCode == HttpStatusCode.OK; } catch (HttpRequestException)