Files
jsmr/JSMR.Infrastructure/Scanning/VoiceWorksScanner.cs

164 lines
5.8 KiB
C#

using HtmlAgilityPack;
using JSMR.Application.Common.Caching;
using JSMR.Application.Scanning;
using JSMR.Application.Scanning.Contracts;
using JSMR.Application.Scanning.Ports;
using JSMR.Infrastructure.Common.Locales;
using JSMR.Infrastructure.Common.SupportedLanguages;
using JSMR.Infrastructure.Http;
using JSMR.Infrastructure.Scanning.Models;
using System.Globalization;
using System.Text.RegularExpressions;
namespace JSMR.Infrastructure.Scanning;
/// <summary>
/// Base class for voice-work scanners. It builds a search URL with
/// <see cref="DLSiteSearchFilterBuilder"/>, loads the resulting HTML page through the injected
/// <see cref="IHtmlLoader"/>, and maps every result node on that page to a <see cref="DLSiteWork"/>.
/// Subclasses supply the locale-specific pieces (supported languages and date parsing).
/// </summary>
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader, ISpamCircleCache spamCircleCache) : IVoiceWorksScanner
{
    // Captures whatever sits between the first pair of parentheses in the rating text.
    // Built once: constructing a Regex for every scanned work is wasted effort.
    private static readonly Regex VotesPattern = new(@"\((.*?)\)", RegexOptions.Compiled);

    protected abstract ILocale Locale { get; }

    /// <summary>Languages added to the search filter by <see cref="GetUrlAsync"/>.</summary>
    protected abstract ISupportedLanguage[] SupportedLanguages { get; }

    /// <summary>Converts the trimmed inner HTML of the expected-date node into a date.</summary>
    protected abstract DateTime? GetEstimatedReleaseDate(string expectedDate);

    /// <summary>Converts the inner HTML of the sales-date node into a date.</summary>
    protected abstract DateTime? GetSalesDate(string salesDate);

    /// <summary>When true, every maker id from the spam-circle cache is excluded from the search.</summary>
    protected virtual bool ExcludeSpamCircles => true;

    /// <summary>When true, the "partially AI generated" exclusion is applied to the search filter.</summary>
    protected virtual bool ExcludePartiallyAIGeneratedWorks => true;

    /// <summary>When true, the "AI generated" exclusion is applied to the search filter.</summary>
    protected virtual bool ExcludeAIGeneratedWorks => true;

    /// <summary>
    /// Loads the page described by <paramref name="request"/> and returns the works found on it.
    /// </summary>
    /// <param name="request">Page number and page size of the search page to scan.</param>
    /// <param name="cancellationToken">Forwarded to URL construction and to the HTML loader.</param>
    /// <returns>One <see cref="DLSiteWork"/> per node returned by the parsed document.</returns>
    public async Task<IReadOnlyList<DLSiteWork>> ScanPageAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken = default)
    {
        DLSiteHtmlDocument document = await GetDLSiteHtmlCollectionAsync(request, cancellationToken);
        List<DLSiteHtmlNode> nodes = document.GetDLSiteNodes();

        return GetDLSiteWorks(nodes);
    }

    /// <summary>Builds the search URL, downloads it and wraps the result in a <see cref="DLSiteHtmlDocument"/>.</summary>
    private async Task<DLSiteHtmlDocument> GetDLSiteHtmlCollectionAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken)
    {
        string url = await GetUrlAsync(request, cancellationToken);
        HtmlDocument document = await htmlLoader.GetHtmlDocumentAsync(url, cancellationToken);

        return new DLSiteHtmlDocument(document);
    }

    /// <summary>
    /// Builds the search URL for the requested page: includes every supported language and applies
    /// the exclusions enabled by the <c>Exclude*</c> properties.
    /// </summary>
    /// <param name="request">Supplies the page number and page size passed to the query builder.</param>
    /// <param name="cancellationToken">Forwarded to the spam-circle cache lookup.</param>
    /// <returns>The string produced by <see cref="DLSiteSearchFilterBuilder"/>.</returns>
    protected virtual async ValueTask<string> GetUrlAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken)
    {
        DLSiteSearchFilterBuilder filterBuilder = new();

        foreach (ISupportedLanguage supportedLanguage in SupportedLanguages)
        {
            filterBuilder.IncludeSupportedLanguage(supportedLanguage);
        }

        if (ExcludeSpamCircles)
        {
            string[] makerIds = await spamCircleCache.GetAsync(cancellationToken);

            foreach (string makerId in makerIds)
            {
                filterBuilder.ExcludeMaker(makerId);
            }
        }

        if (ExcludePartiallyAIGeneratedWorks)
        {
            filterBuilder.ExcludePartiallyAIGeneratedWorks();
        }

        if (ExcludeAIGeneratedWorks)
        {
            filterBuilder.ExcludeAIGeneratedWorks();
        }

        return filterBuilder.BuildSearchQuery(request.PageNumber, request.PageSize);
    }

    /// <summary>
    /// Maps every node to a work, preserving order. No filtering happens here; spam circles are
    /// excluded through the search query built in <see cref="GetUrlAsync"/>.
    /// </summary>
    private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes)
    {
        return nodes.ConvertAll(GetDLSiteWork);
    }

    /// <summary>Extracts the fields of a single work from its result node.</summary>
    /// <remarks>
    /// Optional nodes (expected date, sales date, downloads, star rating) leave the corresponding
    /// properties at their defaults when absent.
    /// </remarks>
    /// <exception cref="FormatException">The downloads text is present but is not a valid integer.</exception>
    private DLSiteWork GetDLSiteWork(DLSiteHtmlNode node)
    {
        string productName = ScannerUtilities.GetDecodedText(node.ProductTextNode);
        string productUrl = node.ProductLinkNode.Attributes["href"].Value;
        string maker = ScannerUtilities.GetDecodedText(node.MakerLinkNode);
        string makerUrl = node.MakerLinkNode.Attributes["href"].Value;

        DLSiteWork work = new()
        {
            ProductName = productName,
            ProductUrl = productUrl,
            ProductId = ScannerUtilities.GetTextBetween(productUrl, "product_id/", ".html"),
            Maker = maker,
            MakerId = ScannerUtilities.GetTextBetween(makerUrl, "maker_id/", ".html"),
            Description = ScannerUtilities.GetDecodedText(node.DescriptionNode)
        };

        if (node.ExpectedDateNode is not null)
        {
            work.ExpectedDate = GetEstimatedReleaseDate(node.ExpectedDateNode.InnerHtml.Trim());
        }

        if (node.SalesDateNode is not null)
        {
            work.SalesDate = GetSalesDate(node.SalesDateNode.InnerHtml);
        }

        if (node.DownloadsNode is not null)
        {
            // Trim first: AllowThousands on its own rejects surrounding whitespace. The invariant
            // culture pins ',' as the group separator so the result does not depend on the host's
            // regional settings.
            // NOTE(review): assumes the page always groups digits with ',' - confirm for every locale scanned.
            work.Downloads = int.Parse(
                node.DownloadsNode.InnerHtml.Trim(),
                NumberStyles.AllowThousands,
                CultureInfo.InvariantCulture);
        }

        if (GetScannedRating(node.StarRatingNode) is { } rating)
        {
            work.StarRating = rating.Score;
            work.Votes = rating.Votes;
        }

        work.Genres = ScannerUtilities.GetStringListFromNodes(node.GenreNodes);
        work.Tags = ScannerUtilities.GetStringListFromNodes(node.SearchTagNodes);
        work.Creators = ScannerUtilities.GetStringListFromNodes(node.CreatorNodes);

        // The listing carries the "_sam" image; the "_main" image URL is derived by swapping the suffix.
        string imageSource = ScannerUtilities.GetImageSource(node.ImageNode);
        string imageUrl = imageSource.Replace("_sam.jpg", "_main.jpg").Replace("_sam.gif", "_main.gif");

        work.SmallImageUrl = imageSource;
        work.ImageUrl = imageUrl;
        work.Type = imageUrl.Contains("ana/doujin", StringComparison.Ordinal) ? "Ana" : "Work";

        return work;
    }

    /// <summary>
    /// Reads the score and vote count from a star-rating node. The score comes from the node's
    /// <c>star_*</c> CSS class (ignoring <c>star_rating</c>); the vote count is the parenthesised
    /// number in the node's text.
    /// </summary>
    /// <param name="starRatingNode">The rating node, or null when the work has none.</param>
    /// <returns>
    /// The parsed rating, or null when the node is missing, has no score class, has no
    /// parenthesised vote text, or either number cannot be parsed.
    /// </returns>
    private static ScannedRating? GetScannedRating(HtmlNode? starRatingNode)
    {
        if (starRatingNode is null)
        {
            return null;
        }

        string? ratingClass = starRatingNode
            .GetClasses()
            .FirstOrDefault(className => className.Contains("star_") && className != "star_rating");

        if (string.IsNullOrEmpty(ratingClass))
        {
            return null;
        }

        Match voteMatch = VotesPattern.Match(starRatingNode.InnerText);

        if (!voteMatch.Success)
        {
            return null;
        }

        // A rating is optional, so a malformed score or vote count is treated as "no rating"
        // rather than failing the scan of the whole page. Invariant culture keeps the parsing
        // independent of the host's regional settings.
        bool hasScore = byte.TryParse(
            ratingClass.Replace("star_", ""),
            NumberStyles.Integer,
            CultureInfo.InvariantCulture,
            out byte score);

        bool hasVotes = int.TryParse(
            voteMatch.Groups[1].Value,
            NumberStyles.AllowThousands,
            CultureInfo.InvariantCulture,
            out int votes);

        if (!hasScore || !hasVotes)
        {
            return null;
        }

        return new ScannedRating
        {
            Score = score,
            Votes = votes
        };
    }
}