Updated scanner logic, and added initial scanner tests.

This commit is contained in:
2025-09-14 21:12:00 -04:00
parent 39274165cb
commit 646cf41476
16 changed files with 412 additions and 192 deletions

View File

@@ -34,6 +34,16 @@ public class DLSiteSearchFilterBuilder
return this;
}
/// <summary>
/// Adds every language in <paramref name="languages"/> to the filter by delegating
/// to <see cref="IncludeSupportedLanguage"/> once per element, in array order.
/// </summary>
/// <param name="languages">Languages to include; an empty array leaves the filter untouched.</param>
/// <returns>This builder, so further filter calls can be chained.</returns>
public DLSiteSearchFilterBuilder IncludeSupportedLanguages(ISupportedLanguage[] languages)
{
    for (int index = 0; index < languages.Length; index++)
    {
        IncludeSupportedLanguage(languages[index]);
    }

    return this;
}
public DLSiteSearchFilterBuilder IncludeSupportedLanguage(ISupportedLanguage language)
{
AddToOptionsAnd(language.Code);

View File

@@ -1,15 +1,14 @@
using JSMR.Application.Common.Caching;
using JSMR.Infrastructure.Common.Locales;
using JSMR.Infrastructure.Common.Locales;
using JSMR.Infrastructure.Common.SupportedLanguages;
using JSMR.Infrastructure.Http;
using System.Globalization;
using System.Text.RegularExpressions;
namespace JSMR.Infrastructure.Scanning;
public partial class EnglishVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spamCircleCache)
: VoiceWorksScanner(loader, spamCircleCache)
public partial class EnglishVoiceWorksScanner(IHtmlLoader loader) : VoiceWorksScanner(loader)
{
[GeneratedRegex(@"Release: (.*?)[/](\d{2})[/](\d{4})", RegexOptions.IgnoreCase, "en-US")]
[GeneratedRegex(@"Release date: (.*?)[/](\d{1,2})[/](\d{4})", RegexOptions.IgnoreCase, "en-US")]
private static partial Regex SalesDateRegex();
[GeneratedRegex(@"^(Early|Middle|Late)\s(.*?)\s(\d{4})", RegexOptions.IgnoreCase, "en-US")]
@@ -24,140 +23,47 @@ public partial class EnglishVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCac
new AlingualLanguage()
];
/// <summary>
/// Converts an approximate English release window (for example "Early Jan. 2025")
/// into a concrete date.
/// </summary>
/// <param name="expectedDate">Expected-release text to interpret.</param>
/// <returns>
/// The 1st, 11th or 21st of the stated month for an early, middle or late window
/// respectively; <c>null</c> when the text contains "Release Date: TBC", does not match
/// the estimated-date pattern, or names a month or year that cannot be parsed.
/// </returns>
protected override DateOnly? GetEstimatedReleaseDate(string expectedDate)
{
    if (expectedDate.Contains("Release Date: TBC", StringComparison.OrdinalIgnoreCase))
        return null;

    Match match = EstimatedDateRegex().Match(expectedDate);

    if (!match.Success)
        return null;

    GroupCollection groups = match.Groups;

    // The pattern is matched case-insensitively, so normalise before mapping the window to a day.
    int day = groups[1].Value.ToLowerInvariant() switch
    {
        "early" => 1,
        "middle" => 11,
        "late" => 21,
        _ => 1
    };

    // Months arrive with a trailing period ("Jan."), which the "MMM" format does not accept.
    string monthAbbreviation = groups[2].Value.Replace(".", "");

    // The month group is a free-form capture, so it may hold text that is not a month
    // abbreviation. Treat that as "no usable date" instead of letting ParseExact throw.
    if (!DateTime.TryParseExact(monthAbbreviation, "MMM", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime parsedMonth))
        return null;

    // Four digits can still spell year 0, which DateOnly rejects.
    if (!int.TryParse(groups[3].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int year) || year < DateOnly.MinValue.Year)
        return null;

    return new DateOnly(year, parsedMonth.Month, day);
}
/// <summary>
/// Extracts the release date from English sales text of the form
/// "Release date: Jan/05/2025" (month abbreviation, day, four-digit year).
/// </summary>
/// <param name="salesDate">Sales-date text to interpret.</param>
/// <returns>
/// The parsed date, or <c>null</c> when the text does not match the sales-date pattern
/// or its month, day or year do not form a valid calendar date.
/// </returns>
protected override DateOnly? GetSalesDate(string salesDate)
{
    Match match = SalesDateRegex().Match(salesDate);

    if (!match.Success)
        return null;

    // The month group is a free-form capture; an unrecognised value yields null rather
    // than a FormatException from ParseExact.
    if (!DateTime.TryParseExact(match.Groups[1].Value, "MMM", CultureInfo.InvariantCulture, DateTimeStyles.None, out DateTime parsedMonth))
        return null;

    // Four digits can still spell year 0, which DateOnly (and DaysInMonth) reject.
    if (!int.TryParse(match.Groups[3].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int year) || year < DateOnly.MinValue.Year)
        return null;

    int month = parsedMonth.Month;

    // The pattern accepts any one- or two-digit day, so reject values such as 0 or 31
    // in a 30-day month before they reach the DateOnly constructor.
    if (!int.TryParse(match.Groups[2].Value, NumberStyles.None, CultureInfo.InvariantCulture, out int day)
        || day < 1
        || day > DateTime.DaysInMonth(year, month))
        return null;

    return new DateOnly(year, month, day);
}
}

View File

@@ -1,13 +1,11 @@
using JSMR.Application.Common.Caching;
using JSMR.Infrastructure.Common.Locales;
using JSMR.Infrastructure.Common.Locales;
using JSMR.Infrastructure.Common.SupportedLanguages;
using JSMR.Infrastructure.Http;
using System.Text.RegularExpressions;
namespace JSMR.Infrastructure.Scanning;
public class JapaneseVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spamCircleCache)
: VoiceWorksScanner(loader, spamCircleCache)
public class JapaneseVoiceWorksScanner(IHtmlLoader loader) : VoiceWorksScanner(loader)
{
protected override ILocale Locale => new JapaneseLocale();
@@ -21,7 +19,7 @@ public class JapaneseVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spam
new AlingualLanguage()
];
protected override DateTime? GetEstimatedReleaseDate(string expectedDate)
protected override DateOnly? GetEstimatedReleaseDate(string expectedDate)
{
if (expectedDate.Contains("販売中") || expectedDate.Contains("発売予定未定"))
return null;
@@ -54,10 +52,10 @@ public class JapaneseVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spam
break;
}
return new DateTime(releaseYear, releaseMonth, releaseDay);
return new DateOnly(releaseYear, releaseMonth, releaseDay);
}
protected override DateTime? GetSalesDate(string salesDate)
protected override DateOnly? GetSalesDate(string salesDate)
{
Regex textRegex = new Regex("販売日:&nbsp;(.*?)年(.*?)月(.*)日", RegexOptions.IgnoreCase);
MatchCollection textMatches = textRegex.Matches(salesDate);
@@ -69,6 +67,6 @@ public class JapaneseVoiceWorksScanner(IHtmlLoader loader, ISpamCircleCache spam
int releaseMonth = Convert.ToInt32(textMatches[0].Groups[2].Value);
int releaseDay = Convert.ToInt32(textMatches[0].Groups[3].Value);
return new DateTime(releaseYear, releaseMonth, releaseDay);
return new DateOnly(releaseYear, releaseMonth, releaseDay);
}
}

View File

@@ -1,6 +1,4 @@
using HtmlAgilityPack;
using JSMR.Application.Common.Caching;
using JSMR.Application.Scanning;
using JSMR.Application.Scanning.Contracts;
using JSMR.Application.Scanning.Ports;
using JSMR.Infrastructure.Common.Locales;
@@ -12,72 +10,57 @@ using System.Text.RegularExpressions;
namespace JSMR.Infrastructure.Scanning;
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader, ISpamCircleCache spamCircleCache) : IVoiceWorksScanner
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksScanner
{
protected abstract ILocale Locale { get; }
protected abstract ISupportedLanguage[] SupportedLanguages { get; }
protected abstract DateTime? GetEstimatedReleaseDate(string expectedDate);
protected abstract DateTime? GetSalesDate(string salesDate);
protected abstract DateOnly? GetEstimatedReleaseDate(string expectedDate);
protected abstract DateOnly? GetSalesDate(string salesDate);
protected virtual bool ExcludeSpamCircles => true;
protected virtual bool ExcludePartiallyAIGeneratedWorks => true;
protected virtual bool ExcludeAIGeneratedWorks => true;
/// <summary>
/// Loads the DLSite document described by <paramref name="options"/> and converts
/// its work nodes into <see cref="DLSiteWork"/> instances.
/// </summary>
/// <param name="options">Scan options used both to fetch the document and to build the works.</param>
/// <param name="cancellationToken">Token forwarded to the document load.</param>
/// <returns>The works produced from the loaded document's nodes.</returns>
public async Task<IReadOnlyList<DLSiteWork>> ScanPageAsync(VoiceWorkScanOptions options, CancellationToken cancellationToken = default)
{
    DLSiteHtmlDocument page = await GetDLSiteHtmlCollectionAsync(options, cancellationToken);

    return GetDLSiteWorks(page.GetDLSiteNodes(), options);
}
/// <summary>
/// Builds the URL for <paramref name="options"/>, fetches it through the HTML loader
/// and wraps the result in a <see cref="DLSiteHtmlDocument"/>.
/// </summary>
/// <param name="options">Scan options passed to <see cref="GetUrl"/>.</param>
/// <param name="cancellationToken">Token forwarded to the HTML loader.</param>
/// <returns>The wrapped HTML document.</returns>
private async Task<DLSiteHtmlDocument> GetDLSiteHtmlCollectionAsync(VoiceWorkScanOptions options, CancellationToken cancellationToken)
{
    HtmlDocument html = await htmlLoader.GetHtmlDocumentAsync(GetUrl(options), cancellationToken);

    return new DLSiteHtmlDocument(html);
}
/// <summary>
/// Builds the search query for one page of results from the scanner's locale and
/// supported languages plus the exclusions requested in <paramref name="options"/>.
/// </summary>
/// <param name="options">
/// Supplies the excluded maker ids, the AI-generated-work exclusion flags, and the
/// page number and page size.
/// </param>
/// <returns>The query produced by the filter builder for the requested page.</returns>
protected string GetUrl(VoiceWorkScanOptions options)
{
    // Kept as a single fluent chain: the local holds whatever the last call returns.
    var searchFilter = new DLSiteSearchFilterBuilder()
        .UseLocale(Locale)
        .IncludeSupportedLanguages(SupportedLanguages)
        .ExcludeMakers(options.ExcludedMakerIds);

    if (options.ExcludePartiallyAIGeneratedWorks)
    {
        searchFilter.ExcludePartiallyAIGeneratedWorks();
    }

    if (options.ExcludeAIGeneratedWorks)
    {
        searchFilter.ExcludeAIGeneratedWorks();
    }

    return searchFilter.BuildSearchQuery(options.PageNumber, options.PageSize);
}
private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes)
private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes, VoiceWorkScanOptions options)
{
var works = new List<DLSiteWork>();
//var spamCircles = SpamCircleCache.Get();
foreach (DLSiteHtmlNode node in nodes)
{
DLSiteWork work = GetDLSiteWork(node);
//if (spamCircles.Any(circle => circle.MakerId == work.MakerId))
// continue;
if (options.ExcludedMakerIds.Any(makerId => makerId == work.MakerId))
continue;
works.Add(work);
}