Initial implementation of voice works scanning.
This commit is contained in:
164
JSMR.Infrastructure/Scanning/VoiceWorksScanner.cs
Normal file
164
JSMR.Infrastructure/Scanning/VoiceWorksScanner.cs
Normal file
@@ -0,0 +1,164 @@
|
||||
using HtmlAgilityPack;
|
||||
using JSMR.Application.Scanning;
|
||||
using JSMR.Application.Scanning.Contracts;
|
||||
using JSMR.Application.Scanning.Ports;
|
||||
using JSMR.Infrastructure.Caching;
|
||||
using JSMR.Infrastructure.Common.Locales;
|
||||
using JSMR.Infrastructure.Common.SupportedLanguages;
|
||||
using JSMR.Infrastructure.Http;
|
||||
using JSMR.Infrastructure.Scanning.Models;
|
||||
using System.Globalization;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace JSMR.Infrastructure.Scanning;
|
||||
|
||||
/// <summary>
/// Base scanner that builds a DLSite search query, loads the resulting HTML page through the
/// injected <see cref="IHtmlLoader"/> and maps every listing node to a <see cref="DLSiteWork"/>.
/// </summary>
/// <param name="htmlLoader">Loads the HTML document for the generated search URL.</param>
/// <param name="spamCircleCache">Supplies the maker ids that are excluded from the search when <c>ExcludeSpamCircles</c> is true.</param>
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader, ISpamCircleCache spamCircleCache) : IVoiceWorksScanner
{
    // Captures the text between the first pair of parentheses in the rating node's text.
    // Built once: Regex instances are immutable and safe to share between calls.
    private static readonly Regex VotesRegex = new(@"\((.*?)\)", RegexOptions.Compiled);

    // Counts are read from raw node text, so accept thousands separators and surrounding whitespace.
    private const NumberStyles CountStyles =
        NumberStyles.AllowThousands | NumberStyles.AllowLeadingWhite | NumberStyles.AllowTrailingWhite;

    /// <summary>Locale of the concrete scanner. Not read by this base class itself.</summary>
    protected abstract ILocale Locale { get; }

    /// <summary>Languages that are added to the search filter, one include per entry.</summary>
    protected abstract ISupportedLanguage[] SupportedLanguages { get; }

    /// <summary>
    /// Converts the text of a listing's expected-date node into a date.
    /// </summary>
    /// <param name="expectedDate">The node's inner HTML with surrounding whitespace trimmed.</param>
    /// <returns>The value stored in <c>DLSiteWork.ExpectedDate</c>; may be <c>null</c>.</returns>
    protected abstract DateTime? GetEstimatedReleaseDate(string expectedDate);

    /// <summary>
    /// Converts the text of a listing's sales-date node into a date.
    /// </summary>
    /// <param name="salesDate">The node's inner HTML exactly as found in the page (not trimmed).</param>
    /// <returns>The value stored in <c>DLSiteWork.SalesDate</c>; may be <c>null</c>.</returns>
    protected abstract DateTime? GetSalesDate(string salesDate);

    /// <summary>When true, every maker id returned by the spam-circle cache is excluded from the search.</summary>
    protected virtual bool ExcludeSpamCircles => true;

    /// <summary>When true, the search filter excludes partially AI-generated works.</summary>
    protected virtual bool ExcludePartiallyAIGeneratedWorks => true;

    /// <summary>When true, the search filter excludes AI-generated works.</summary>
    protected virtual bool ExcludeAIGeneratedWorks => true;

    /// <summary>
    /// Loads the page described by <paramref name="request"/> and returns one work per listing node found on it.
    /// </summary>
    /// <param name="request">Supplies the page number and page size of the search query.</param>
    /// <param name="cancellationToken">Forwarded to the spam-circle cache and the HTML loader.</param>
    /// <returns>The works parsed from the page, in the order the document yields their nodes.</returns>
    /// <exception cref="FormatException">A download or vote count, or a star-rating class, is not a valid number.</exception>
    public async Task<IReadOnlyList<DLSiteWork>> ScanPageAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken = default)
    {
        DLSiteHtmlDocument document = await GetDLSiteHtmlCollectionAsync(request, cancellationToken);

        return GetDLSiteWorks(document.GetDLSiteNodes());
    }

    /// <summary>Builds the search URL, downloads it and wraps the HTML in a <see cref="DLSiteHtmlDocument"/>.</summary>
    private async Task<DLSiteHtmlDocument> GetDLSiteHtmlCollectionAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken)
    {
        string url = await GetUrlAsync(request, cancellationToken);
        HtmlDocument html = await htmlLoader.GetHtmlDocumentAsync(url, cancellationToken);

        return new DLSiteHtmlDocument(html);
    }

    /// <summary>
    /// Builds the search query for the requested page from the supported languages and the exclusion switches.
    /// </summary>
    /// <param name="request">Supplies the page number and page size.</param>
    /// <param name="cancellationToken">Forwarded to the spam-circle cache lookup.</param>
    /// <returns>The string produced by <c>DLSiteSearchFilterBuilder.BuildSearchQuery</c>.</returns>
    protected virtual async ValueTask<string> GetUrlAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken)
    {
        DLSiteSearchFilterBuilder filterBuilder = new();

        foreach (ISupportedLanguage supportedLanguage in SupportedLanguages)
        {
            filterBuilder.IncludeSupportedLanguage(supportedLanguage);
        }

        if (ExcludeSpamCircles)
        {
            string[] makerIds = await spamCircleCache.GetAsync(cancellationToken);

            foreach (string makerId in makerIds)
            {
                filterBuilder.ExcludeMaker(makerId);
            }
        }

        if (ExcludePartiallyAIGeneratedWorks)
        {
            filterBuilder.ExcludePartiallyAIGeneratedWorks();
        }

        if (ExcludeAIGeneratedWorks)
        {
            filterBuilder.ExcludeAIGeneratedWorks();
        }

        return filterBuilder.BuildSearchQuery(request.PageNumber, request.PageSize);
    }

    /// <summary>Maps every listing node to a work, preserving the node order.</summary>
    private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes) => nodes.ConvertAll(GetDLSiteWork);

    /// <summary>
    /// Reads one listing node into a <see cref="DLSiteWork"/>. Expected date, sales date, downloads and
    /// rating are only filled in when the corresponding node is present.
    /// </summary>
    /// <exception cref="FormatException">The downloads text or the rating data is not a valid number.</exception>
    private DLSiteWork GetDLSiteWork(DLSiteHtmlNode node)
    {
        string productUrl = node.ProductLinkNode.Attributes["href"].Value;
        string makerUrl = node.MakerLinkNode.Attributes["href"].Value;

        // The full-size image URL is derived from the listed image source by swapping "_sam" for "_main".
        string smallImageUrl = ScannerUtilities.GetImageSource(node.ImageNode);
        string imageUrl = smallImageUrl.Replace("_sam.jpg", "_main.jpg").Replace("_sam.gif", "_main.gif");

        DLSiteWork work = new()
        {
            ProductName = ScannerUtilities.GetDecodedText(node.ProductTextNode),
            ProductUrl = productUrl,
            ProductId = ScannerUtilities.GetTextBetween(productUrl, "product_id/", ".html"),
            Maker = ScannerUtilities.GetDecodedText(node.MakerLinkNode),
            MakerId = ScannerUtilities.GetTextBetween(makerUrl, "maker_id/", ".html"),
            Description = ScannerUtilities.GetDecodedText(node.DescriptionNode),
            Genres = ScannerUtilities.GetStringListFromNodes(node.GenreNodes),
            Tags = ScannerUtilities.GetStringListFromNodes(node.SearchTagNodes),
            Creators = ScannerUtilities.GetStringListFromNodes(node.CreatorNodes),
            SmallImageUrl = smallImageUrl,
            ImageUrl = imageUrl,
            Type = imageUrl.Contains("ana/doujin") ? "Ana" : "Work",
        };

        if (node.ExpectedDateNode is not null)
        {
            work.ExpectedDate = GetEstimatedReleaseDate(node.ExpectedDateNode.InnerHtml.Trim());
        }

        if (node.SalesDateNode is not null)
        {
            work.SalesDate = GetSalesDate(node.SalesDateNode.InnerHtml);
        }

        if (node.DownloadsNode is not null)
        {
            // Invariant culture: the page always groups digits with ',', whatever the host's culture is.
            work.Downloads = int.Parse(node.DownloadsNode.InnerHtml, CountStyles, CultureInfo.InvariantCulture);
        }

        ScannedRating? rating = GetScannedRating(node.StarRatingNode);

        if (rating is not null)
        {
            work.StarRating = rating.Score;
            work.Votes = rating.Votes;
        }

        return work;
    }

    /// <summary>
    /// Extracts the score and vote count from a star-rating node.
    /// The score is the numeric suffix of the first CSS class that contains "star_" (other than
    /// "star_rating"); the vote count is the text inside the first pair of parentheses of the node text.
    /// </summary>
    /// <param name="starRatingNode">The rating node, or <c>null</c> when the listing has none.</param>
    /// <returns>The rating, or <c>null</c> when the node, the score class or the parenthesised text is missing.</returns>
    /// <exception cref="FormatException">The class suffix or the parenthesised text is not a valid number.</exception>
    private static ScannedRating? GetScannedRating(HtmlNode? starRatingNode)
    {
        if (starRatingNode is null)
        {
            return null;
        }

        string? ratingClass = starRatingNode.GetClasses().FirstOrDefault(className =>
            className.Contains("star_") && className != "star_rating");

        if (string.IsNullOrEmpty(ratingClass))
        {
            return null;
        }

        Match votesMatch = VotesRegex.Match(starRatingNode.InnerText);

        if (!votesMatch.Success)
        {
            return null;
        }

        return new ScannedRating
        {
            Score = byte.Parse(ratingClass.Replace("star_", ""), NumberStyles.Integer, CultureInfo.InvariantCulture),
            Votes = int.Parse(votesMatch.Groups[1].Value, CountStyles, CultureInfo.InvariantCulture),
        };
    }
}
|
||||
Reference in New Issue
Block a user