Updated scanner logic and added initial scanner tests.

This commit is contained in:
2025-09-14 21:12:00 -04:00
parent 39274165cb
commit 646cf41476
16 changed files with 412 additions and 192 deletions

View File

@@ -1,6 +1,4 @@
using HtmlAgilityPack;
using JSMR.Application.Common.Caching;
using JSMR.Application.Scanning;
using JSMR.Application.Scanning.Contracts;
using JSMR.Application.Scanning.Ports;
using JSMR.Infrastructure.Common.Locales;
@@ -12,72 +10,57 @@ using System.Text.RegularExpressions;
namespace JSMR.Infrastructure.Scanning;
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader, ISpamCircleCache spamCircleCache) : IVoiceWorksScanner
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksScanner
{
// NOTE(review): this is a diff hunk rendered without +/- markers. The DateTime? pair below is
// presumably the removed version and the DateOnly? pair its replacement — confirm against the commit.

// Hooks supplied by concrete scanners: the locale and the languages fed into the search filter.
protected abstract ILocale Locale { get; }
protected abstract ISupportedLanguage[] SupportedLanguages { get; }
// Date-parsing hooks that take the raw date text; the expected text format is not visible here.
protected abstract DateTime? GetEstimatedReleaseDate(string expectedDate);
protected abstract DateTime? GetSalesDate(string salesDate);
protected abstract DateOnly? GetEstimatedReleaseDate(string expectedDate);
protected abstract DateOnly? GetSalesDate(string salesDate);
// Overridable exclusion switches, each defaulting to true.
// NOTE(review): the post-change URL builder reads the AI-related flags from the scan options
// instead; whether these properties survive the commit cannot be told from this view.
protected virtual bool ExcludeSpamCircles => true;
protected virtual bool ExcludePartiallyAIGeneratedWorks => true;
protected virtual bool ExcludeAIGeneratedWorks => true;
// NOTE(review): diff hunk rendered without +/- markers — each pair of near-identical lines is
// presumably the pre-change line followed by its replacement (ScanVoiceWorksRequest -> VoiceWorkScanOptions).
//
// Fetches the HTML document for the requested scan, extracts its DLSite nodes, and maps them
// to DLSiteWork items. The cancellation token is forwarded to the HTML fetch.
public async Task<IReadOnlyList<DLSiteWork>> ScanPageAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken = default)
public async Task<IReadOnlyList<DLSiteWork>> ScanPageAsync(VoiceWorkScanOptions options, CancellationToken cancellationToken = default)
{
DLSiteHtmlDocument document = await GetDLSiteHtmlCollectionAsync(request, cancellationToken);
DLSiteHtmlDocument document = await GetDLSiteHtmlCollectionAsync(options, cancellationToken);
List<DLSiteHtmlNode> nodes = document.GetDLSiteNodes();
// The second variant also passes the options through so the mapping step can filter by maker id.
return GetDLSiteWorks(nodes);
return GetDLSiteWorks(nodes, options);
}
// NOTE(review): diff hunk rendered without +/- markers — each pair of near-identical lines is
// presumably the pre-change line followed by its replacement.
//
// Builds the search URL, loads the page through the injected htmlLoader, and wraps the
// resulting HtmlDocument in a DLSiteHtmlDocument.
private async Task<DLSiteHtmlDocument> GetDLSiteHtmlCollectionAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken)
private async Task<DLSiteHtmlDocument> GetDLSiteHtmlCollectionAsync(VoiceWorkScanOptions options, CancellationToken cancellationToken)
{
// The first variant awaits an asynchronous URL builder; the second calls a synchronous one.
string url = await GetUrlAsync(request, cancellationToken);
string url = GetUrl(options);
HtmlDocument document = await htmlLoader.GetHtmlDocumentAsync(url, cancellationToken);
return new DLSiteHtmlDocument(document);
}
// NOTE(review): diff hunk rendered without +/- markers, so two versions of this method are
// interleaved. Presumably the virtual async GetUrlAsync(request, ...) lines are the removed
// version and the synchronous GetUrl(options) lines are the replacement — confirm against the commit.
//
// Builds the DLSite search query string for one page: applies language and maker filters,
// optionally excludes (partially) AI-generated works, then asks the builder for the query
// using the page number and page size.
protected virtual async ValueTask<string> GetUrlAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken)
protected string GetUrl(VoiceWorkScanOptions options)
{
DLSiteSearchFilterBuilder filterBuilder = new();
// Fluent variant: locale, supported languages, and excluded makers are set in one chain,
// with the maker ids taken from the options.
var filterBuilder = new DLSiteSearchFilterBuilder()
.UseLocale(Locale)
.IncludeSupportedLanguages(SupportedLanguages)
.ExcludeMakers(options.ExcludedMakerIds);
// Loop variant: adds each supported language to the filter one at a time.
foreach (ISupportedLanguage supprotedLanguage in SupportedLanguages)
{
filterBuilder.IncludeSupportedLanguage(supprotedLanguage);
}
// Cache variant: maker ids to exclude are fetched from spamCircleCache when the class-level
// ExcludeSpamCircles switch is on.
if (ExcludeSpamCircles)
{
string[] makerIds = await spamCircleCache.GetAsync(cancellationToken);
foreach (string makerId in makerIds)
filterBuilder.ExcludeMaker(makerId);
}
// The AI-exclusion flags appear twice: once read from the class-level properties and once
// from the options object.
if (ExcludePartiallyAIGeneratedWorks)
if (options.ExcludePartiallyAIGeneratedWorks)
filterBuilder.ExcludePartiallyAIGeneratedWorks();
if (ExcludeAIGeneratedWorks)
if (options.ExcludeAIGeneratedWorks)
filterBuilder.ExcludeAIGeneratedWorks();
return filterBuilder.BuildSearchQuery(request.PageNumber, request.PageSize);
return filterBuilder.BuildSearchQuery(options.PageNumber, options.PageSize);
}
private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes)
private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes, VoiceWorkScanOptions options)
{
var works = new List<DLSiteWork>();
//var spamCircles = SpamCircleCache.Get();
foreach (DLSiteHtmlNode node in nodes)
{
DLSiteWork work = GetDLSiteWork(node);
//if (spamCircles.Any(circle => circle.MakerId == work.MakerId))
// continue;
if (options.ExcludedMakerIds.Any(makerId => makerId == work.MakerId))
continue;
works.Add(work);
}