Updated scanner logic, and added initial scanner tests.
This commit is contained in:
@@ -34,6 +34,16 @@ public class DLSiteSearchFilterBuilder
|
||||
return this;
|
||||
}
|
||||
|
||||
/// <summary>
/// Adds every language in <paramref name="languages"/> to the filter by calling
/// <c>IncludeSupportedLanguage</c> once per element, in array order.
/// </summary>
/// <param name="languages">The languages to include in the search filter.</param>
/// <returns>This builder instance, so calls can be chained.</returns>
public DLSiteSearchFilterBuilder IncludeSupportedLanguages(ISupportedLanguage[] languages)
{
    for (int index = 0; index < languages.Length; index++)
    {
        IncludeSupportedLanguage(languages[index]);
    }

    return this;
}
|
||||
|
||||
public DLSiteSearchFilterBuilder IncludeSupportedLanguage(ISupportedLanguage language)
|
||||
{
|
||||
AddToOptionsAnd(language.Code);
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
using JSMR.Application.Common.Caching;
|
||||
using JSMR.Infrastructure.Common.Locales;
|
||||
using JSMR.Infrastructure.Common.Locales;
|
||||
using JSMR.Infrastructure.Common.SupportedLanguages;
|
||||
using JSMR.Infrastructure.Http;
|
||||
using System.Globalization;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace JSMR.Infrastructure.Scanning;
|
||||
|
||||
public partial class EnglishVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spamCircleCache)
|
||||
: VoiceWorksScanner(loader, spamCircleCache)
|
||||
public partial class EnglishVoiceWorksScanner(IHtmlLoader loader) : VoiceWorksScanner(loader)
|
||||
{
|
||||
[GeneratedRegex(@"Release: (.*?)[/](\d{2})[/](\d{4})", RegexOptions.IgnoreCase, "en-US")]
|
||||
[GeneratedRegex(@"Release date: (.*?)[/](\d{1,2})[/](\d{4})", RegexOptions.IgnoreCase, "en-US")]
|
||||
private static partial Regex SalesDateRegex();
|
||||
|
||||
[GeneratedRegex(@"^(Early|Middle|Late)\s(.*?)\s(\d{4})", RegexOptions.IgnoreCase, "en-US")]
|
||||
@@ -24,140 +23,47 @@ public partial class EnglishVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCac
|
||||
new AlingualLanguage()
|
||||
];
|
||||
|
||||
/// <summary>
/// Converts an English release estimate of the form "Early|Middle|Late &lt;month&gt; &lt;year&gt;"
/// into a concrete date.
/// </summary>
/// <param name="expectedDate">The raw estimate text to interpret.</param>
/// <returns>
/// The 1st, 11th or 21st of the parsed month for "Early", "Middle" or "Late" respectively;
/// <c>null</c> when the text contains "Release Date: TBC", does not match
/// <c>EstimatedDateRegex</c>, or carries a month/year that cannot be parsed.
/// </returns>
protected override DateOnly? GetEstimatedReleaseDate(string expectedDate)
{
    if (expectedDate.Contains("Release Date: TBC", StringComparison.OrdinalIgnoreCase))
        return null;

    Match match = EstimatedDateRegex().Match(expectedDate);

    if (!match.Success)
        return null;

    GroupCollection groups = match.Groups;

    int day = groups[1].Value.ToLowerInvariant() switch
    {
        "early" => 1,
        "middle" => 11,
        "late" => 21,
        _ => 1
    };

    // The month group of the regex is a lazy wildcard, so it may capture text that is not a
    // month abbreviation. TryParseExact reports that as a failure (null result) instead of
    // throwing FormatException, and it also rejects an out-of-range year such as "0000".
    string monthAbbreviation = groups[2].Value.Replace(".", "").Trim();
    string monthAndYear = $"{monthAbbreviation} {groups[3].Value}";

    if (!DateOnly.TryParseExact(monthAndYear, "MMM yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateOnly firstOfMonth))
        return null;

    // Day is always 1, 11 or 21, which exists in every month.
    return new DateOnly(firstOfMonth.Year, firstOfMonth.Month, day);
}
|
||||
|
||||
/// <summary>
/// Extracts the sales date from English text of the form
/// "Release date: &lt;month&gt;/&lt;day&gt;/&lt;year&gt;".
/// </summary>
/// <param name="salesDate">The raw sales-date text to interpret.</param>
/// <returns>
/// The parsed date, or <c>null</c> when the text does not match <c>SalesDateRegex</c> or the
/// captured month/day/year do not form a valid calendar date.
/// </returns>
protected override DateOnly? GetSalesDate(string salesDate)
{
    Match match = SalesDateRegex().Match(salesDate);

    if (!match.Success)
        return null;

    // Tolerate a trailing "." on the abbreviation, as the estimated-date parser does.
    string monthAbbreviation = match.Groups[1].Value.Trim().TrimEnd('.');
    string candidate = $"{monthAbbreviation}/{match.Groups[2].Value}/{match.Groups[3].Value}";

    // Parse month, day and year together: an unknown month abbreviation or an impossible
    // combination (e.g. Feb/31) yields null instead of a FormatException or
    // ArgumentOutOfRangeException escaping to the caller.
    if (!DateOnly.TryParseExact(candidate, "MMM'/'d'/'yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateOnly parsed))
        return null;

    return parsed;
}
|
||||
}
|
||||
@@ -1,13 +1,11 @@
|
||||
using JSMR.Application.Common.Caching;
|
||||
using JSMR.Infrastructure.Common.Locales;
|
||||
using JSMR.Infrastructure.Common.Locales;
|
||||
using JSMR.Infrastructure.Common.SupportedLanguages;
|
||||
using JSMR.Infrastructure.Http;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace JSMR.Infrastructure.Scanning;
|
||||
|
||||
public class JapaneseVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spamCircleCache)
|
||||
: VoiceWorksScanner(loader, spamCircleCache)
|
||||
public class JapaneseVoiceWorksScanner(IHtmlLoader loader) : VoiceWorksScanner(loader)
|
||||
{
|
||||
protected override ILocale Locale => new JapaneseLocale();
|
||||
|
||||
@@ -21,7 +19,7 @@ public class JapaneseVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spam
|
||||
new AlingualLanguage()
|
||||
];
|
||||
|
||||
protected override DateTime? GetEstimatedReleaseDate(string expectedDate)
|
||||
protected override DateOnly? GetEstimatedReleaseDate(string expectedDate)
|
||||
{
|
||||
if (expectedDate.Contains("販売中") || expectedDate.Contains("発売予定未定"))
|
||||
return null;
|
||||
@@ -54,10 +52,10 @@ public class JapaneseVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spam
|
||||
break;
|
||||
}
|
||||
|
||||
return new DateTime(releaseYear, releaseMonth, releaseDay);
|
||||
return new DateOnly(releaseYear, releaseMonth, releaseDay);
|
||||
}
|
||||
|
||||
protected override DateTime? GetSalesDate(string salesDate)
|
||||
protected override DateOnly? GetSalesDate(string salesDate)
|
||||
{
|
||||
Regex textRegex = new Regex("販売日: (.*?)年(.*?)月(.*)日", RegexOptions.IgnoreCase);
|
||||
MatchCollection textMatches = textRegex.Matches(salesDate);
|
||||
@@ -69,6 +67,6 @@ public class JapaneseVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spam
|
||||
int releaseMonth = Convert.ToInt32(textMatches[0].Groups[2].Value);
|
||||
int releaseDay = Convert.ToInt32(textMatches[0].Groups[3].Value);
|
||||
|
||||
return new DateTime(releaseYear, releaseMonth, releaseDay);
|
||||
return new DateOnly(releaseYear, releaseMonth, releaseDay);
|
||||
}
|
||||
}
|
||||
@@ -1,6 +1,4 @@
|
||||
using HtmlAgilityPack;
|
||||
using JSMR.Application.Common.Caching;
|
||||
using JSMR.Application.Scanning;
|
||||
using JSMR.Application.Scanning.Contracts;
|
||||
using JSMR.Application.Scanning.Ports;
|
||||
using JSMR.Infrastructure.Common.Locales;
|
||||
@@ -12,72 +10,57 @@ using System.Text.RegularExpressions;
|
||||
|
||||
namespace JSMR.Infrastructure.Scanning;
|
||||
|
||||
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader, ISpamCircleCache spamCircleCache) : IVoiceWorksScanner
|
||||
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksScanner
|
||||
{
|
||||
protected abstract ILocale Locale { get; }
|
||||
protected abstract ISupportedLanguage[] SupportedLanguages { get; }
|
||||
|
||||
protected abstract DateTime? GetEstimatedReleaseDate(string expectedDate);
|
||||
protected abstract DateTime? GetSalesDate(string salesDate);
|
||||
protected abstract DateOnly? GetEstimatedReleaseDate(string expectedDate);
|
||||
protected abstract DateOnly? GetSalesDate(string salesDate);
|
||||
|
||||
protected virtual bool ExcludeSpamCircles => true;
|
||||
protected virtual bool ExcludePartiallyAIGeneratedWorks => true;
|
||||
protected virtual bool ExcludeAIGeneratedWorks => true;
|
||||
|
||||
/// <summary>
/// Loads one search-result page for the given options and converts its nodes into works.
/// </summary>
/// <param name="options">Scan options; forwarded to page loading and to work extraction.</param>
/// <param name="cancellationToken">Token forwarded to the page load.</param>
/// <returns>The works produced by <c>GetDLSiteWorks</c> for the loaded page.</returns>
public async Task<IReadOnlyList<DLSiteWork>> ScanPageAsync(VoiceWorkScanOptions options, CancellationToken cancellationToken = default)
{
    DLSiteHtmlDocument page = await GetDLSiteHtmlCollectionAsync(options, cancellationToken);

    return GetDLSiteWorks(page.GetDLSiteNodes(), options);
}
|
||||
|
||||
/// <summary>
/// Builds the search URL for <paramref name="options"/>, fetches it through the HTML loader
/// and wraps the result in a <c>DLSiteHtmlDocument</c>.
/// </summary>
/// <param name="options">Scan options used to build the search URL.</param>
/// <param name="cancellationToken">Token passed to the HTML loader.</param>
/// <returns>The wrapped HTML document for the requested page.</returns>
private async Task<DLSiteHtmlDocument> GetDLSiteHtmlCollectionAsync(VoiceWorkScanOptions options, CancellationToken cancellationToken)
{
    HtmlDocument html = await htmlLoader.GetHtmlDocumentAsync(GetUrl(options), cancellationToken);

    return new DLSiteHtmlDocument(html);
}
|
||||
|
||||
/// <summary>
/// Builds the search query URL for one page of results.
/// </summary>
/// <remarks>
/// The filter uses this scanner's <c>Locale</c> and <c>SupportedLanguages</c>, excludes the
/// maker ids listed in <paramref name="options"/>, and applies the AI-generated exclusions
/// only when the corresponding option flags are set.
/// </remarks>
/// <param name="options">Scan options supplying exclusions, page number and page size.</param>
/// <returns>The query string produced by <c>BuildSearchQuery</c>.</returns>
protected string GetUrl(VoiceWorkScanOptions options)
{
    // NOTE(review): the class-level ExcludeSpamCircles / ExcludePartiallyAIGeneratedWorks /
    // ExcludeAIGeneratedWorks virtual properties are not consulted here; only the option
    // flags are. Confirm whether those properties are still needed.
    var query = new DLSiteSearchFilterBuilder()
        .UseLocale(Locale)
        .IncludeSupportedLanguages(SupportedLanguages)
        .ExcludeMakers(options.ExcludedMakerIds);

    if (options.ExcludePartiallyAIGeneratedWorks)
    {
        query.ExcludePartiallyAIGeneratedWorks();
    }

    if (options.ExcludeAIGeneratedWorks)
    {
        query.ExcludeAIGeneratedWorks();
    }

    return query.BuildSearchQuery(options.PageNumber, options.PageSize);
}
|
||||
|
||||
private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes)
|
||||
private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes, VoiceWorkScanOptions options)
|
||||
{
|
||||
var works = new List<DLSiteWork>();
|
||||
//var spamCircles = SpamCircleCache.Get();
|
||||
|
||||
foreach (DLSiteHtmlNode node in nodes)
|
||||
{
|
||||
DLSiteWork work = GetDLSiteWork(node);
|
||||
|
||||
//if (spamCircles.Any(circle => circle.MakerId == work.MakerId))
|
||||
// continue;
|
||||
if (options.ExcludedMakerIds.Any(makerId => makerId == work.MakerId))
|
||||
continue;
|
||||
|
||||
works.Add(work);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user