Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions PoliNetwork.Graduatorie.Common/Utils/EnrollUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,11 @@ public static EnrollType GetEnrollType(string? rowCanEnrollInto, bool rowCanEnro

var s = rowCanEnrollInto.Split(sep).ToList();
var type = s.FirstOrDefault(x => tester.Any(t => t == x.ToLower()));
if (type != null)
{
s.Remove(type);
}

if (type != null) s.Remove(type);

var course = string.Join(sep, s);
return new EnrollType { CanEnroll = true, Course = course, Type = type };
return new EnrollType { CanEnroll = true, Course = course, Type = type };

}
}
53 changes: 53 additions & 0 deletions PoliNetwork.Graduatorie.Scraper/Utils/Web/CheckUrlUtil.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#region

using PoliNetwork.Graduatorie.Common.Enums;
using PoliNetwork.Graduatorie.Common.Objects.RankingNS;

#endregion

namespace PoliNetwork.Graduatorie.Scraper.Utils.Web;

/// <summary>
///     Helpers that validate <see cref="RankingUrl" /> candidates concurrently and
///     gather the ones that pass validation into a single set.
/// </summary>
public class CheckUrlUtil
{
    /// <summary>
    ///     Validates one candidate through <c>UrlUtils.CheckUrl</c> and, when the check
    ///     succeeds, stores it in <paramref name="final" />.
    /// </summary>
    /// <param name="variable">The candidate ranking URL to validate.</param>
    /// <param name="final">
    ///     The shared result set. It is locked while being mutated because this method
    ///     is invoked from several threads against the same set.
    /// </param>
    /// <remarks>
    ///     Any exception raised during the check is written to the console and not
    ///     rethrown, so one failing candidate never interrupts the others.
    /// </remarks>
    public static void CheckUrl(RankingUrl variable, HashSet<RankingUrl> final)
    {
        try
        {
            if (UrlUtils.CheckUrl(variable))
            {
                // The set is shared between concurrent callers: serialize the insertion.
                lock (final)
                {
                    final.Add(variable);
                }
            }
        }
        catch (Exception exception)
        {
            Console.WriteLine(exception);
        }
    }

    /// <summary>
    ///     Converts raw links into <see cref="RankingUrl" /> objects, keeps only those
    ///     whose page type is <see cref="PageEnum.Index" />, and returns the subset
    ///     that passes the URL check.
    /// </summary>
    /// <param name="rankingsLinks">The raw link strings to convert and validate.</param>
    /// <returns>The set of index-page URLs that passed the check.</returns>
    public static HashSet<RankingUrl> GetRankingLinks(IEnumerable<string> rankingsLinks)
    {
        // Conversion and filtering run through PLINQ; the result is materialized
        // before the URL checks start.
        var indexPages = rankingsLinks
            .AsParallel()
            .Select(RankingUrl.From)
            .Where(candidate => candidate.PageEnum == PageEnum.Index)
            .ToList();

        return GetRankingLinksHashSet(indexPages);
    }

    /// <summary>
    ///     Runs <see cref="CheckUrl" /> for every supplied URL in parallel and returns
    ///     the ones that were accepted.
    /// </summary>
    /// <param name="parallelQuery">The candidate URLs to validate.</param>
    /// <returns>A new set holding every candidate that passed the check.</returns>
    public static HashSet<RankingUrl> GetRankingLinksHashSet(IEnumerable<RankingUrl> parallelQuery)
    {
        var accepted = new HashSet<RankingUrl>();

        // One deferred check per candidate; nothing is executed until Parallel.Invoke.
        var checks = new List<Action>();
        foreach (var candidate in parallelQuery)
        {
            checks.Add(() => { CheckUrl(candidate, accepted); });
        }

        // Blocks until every check has completed.
        Parallel.Invoke(checks.ToArray());

        return accepted;
    }
}
13 changes: 4 additions & 9 deletions PoliNetwork.Graduatorie.Scraper/Utils/Web/LinksFind.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

using PoliNetwork.Core.Utils;
using PoliNetwork.Graduatorie.Common.Data;
using PoliNetwork.Graduatorie.Common.Enums;
using PoliNetwork.Graduatorie.Common.Extensions;
using PoliNetwork.Graduatorie.Common.Objects.RankingNS;

Expand All @@ -22,27 +21,23 @@ public static IEnumerable<RankingUrl> GetAll()
var rankingsLinks = new HashSet<string>();
rankingsLinks.AddRange(polimiNewsLinks, combinationLinks);

var rankingsUrls = rankingsLinks
.AsParallel() // from 500ms to 86ms
.Select(RankingUrl.From)
.Where(r => r.PageEnum == PageEnum.Index)
.Where(UrlUtils.CheckUrl)
.ToHashSet();
var rankingsUrls = CheckUrlUtil.GetRankingLinks(rankingsLinks);

var len = rankingsUrls.ToArray().Length;
Console.WriteLine($"[INFO] LinksFind.GetAll found {len} links");
return rankingsUrls;
}


private static IEnumerable<string> GetCombinationLinks()
{
var r = new HashSet<string>();
var nowYear = DateTime.UtcNow.Year;
for (var i = 2021; i <= nowYear; i++) r.AddRange(GetYearCominationLinks(i));
for (var i = 2021; i <= nowYear; i++) r.AddRange(GetYearCombinationLinks(i));
return r;
}

private static IEnumerable<string> GetYearCominationLinks(int year)
private static IEnumerable<string> GetYearCombinationLinks(int year)
{
// partially implemented: polimi has recently added 4 hex chars in the first part
// of the path (2022_20064_XXXX_html/) which would require 65k combinations for each
Expand Down
2 changes: 1 addition & 1 deletion PoliNetwork.Graduatorie.Scraper/Utils/Web/ScraperOutput.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public static void Write(List<RankingUrl> rankingsUrls, string? dataFolder)
private static string GetOutputLinksString(IEnumerable<RankingUrl> rankingsUrls)
{
var output = "";
var urls = rankingsUrls.Where(UrlUtils.CheckUrl).Select(x => x.Url).Order();
var urls = CheckUrlUtil.GetRankingLinksHashSet(rankingsUrls).Order();
foreach (var link in urls)
{
output += link;
Expand Down
6 changes: 4 additions & 2 deletions PoliNetwork.Graduatorie.Scraper/Utils/Web/UrlUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,15 @@ public static string UrlifyLocalHref(string href, string? domain)

public static bool CheckUrl(RankingUrl? url)
{
if (string.IsNullOrEmpty(url?.Url))
var urlUrl = url?.Url;
if (string.IsNullOrEmpty(urlUrl))
return false;

using var client = new HttpClient();
try
{
var response = client.GetAsync(url.Url).Result;
var async = client.GetAsync(urlUrl);
var response = async.Result;
return response.StatusCode == HttpStatusCode.OK;
}
catch (HttpRequestException)
Expand Down