using HtmlAgilityPack;
using JSMR.Application.Scanning;
using JSMR.Application.Scanning.Contracts;
using JSMR.Application.Scanning.Ports;
using JSMR.Infrastructure.Caching;
using JSMR.Infrastructure.Common.Locales;
using JSMR.Infrastructure.Common.SupportedLanguages;
using JSMR.Infrastructure.Http;
using JSMR.Infrastructure.Scanning.Models;
using System.Globalization;
using System.Text.RegularExpressions;

namespace JSMR.Infrastructure.Scanning;

/// <summary>
/// Base class for scanners that build a DLSite search query, load the resulting HTML page
/// and convert every listing node on it into a <see cref="DLSiteWork"/>.
/// Concrete scanners supply the locale, the supported languages and the date parsing.
/// </summary>
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader, ISpamCircleCache spamCircleCache) : IVoiceWorksScanner
{
    // Captures the text between the first pair of parentheses in the rating node's text
    // (the vote count). Built once instead of on every call.
    private static readonly Regex VotesRegex = new(@"\((.*?)\)", RegexOptions.IgnoreCase);

    /// <summary>Locale supplied by the concrete scanner. It is not read by this base class.</summary>
    protected abstract ILocale Locale { get; }

    /// <summary>Languages that are added to the search filter by <see cref="GetUrlAsync"/>.</summary>
    protected abstract ISupportedLanguage[] SupportedLanguages { get; }

    /// <summary>Converts the trimmed inner HTML of a listing's expected-date node into a date.</summary>
    /// <param name="expectedDate">Trimmed inner HTML of the expected-date node.</param>
    protected abstract DateTime? GetEstimatedReleaseDate(string expectedDate);

    /// <summary>Converts the inner HTML of a listing's sales-date node into a date.</summary>
    /// <param name="salesDate">Inner HTML of the sales-date node, passed through untrimmed.</param>
    protected abstract DateTime? GetSalesDate(string salesDate);

    /// <summary>
    /// When true (the default), every maker id returned by the spam-circle cache is excluded
    /// from the search query.
    /// </summary>
    protected virtual bool ExcludeSpamCircles => true;

    /// <summary>When true (the default), the search filter excludes partially AI-generated works.</summary>
    protected virtual bool ExcludePartiallyAIGeneratedWorks => true;

    /// <summary>When true (the default), the search filter excludes AI-generated works.</summary>
    protected virtual bool ExcludeAIGeneratedWorks => true;

    /// <summary>
    /// Loads the search page described by <paramref name="request"/> and returns one
    /// <see cref="DLSiteWork"/> per listing node found on it, in document order.
    /// </summary>
    /// <param name="request">Page number and page size of the search page to scan.</param>
    /// <param name="cancellationToken">Forwarded to the spam-circle cache and the HTML loader.</param>
    /// <exception cref="FormatException">A download count on the page is not a valid number.</exception>
    // NOTE(review): the generic type arguments in this file were not visible in the reviewed
    // text and have been reconstructed from usage. Confirm that this return type matches the
    // declaration on IVoiceWorksScanner.
    public async Task<IReadOnlyList<DLSiteWork>> ScanPageAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken = default)
    {
        DLSiteHtmlDocument document = await GetDLSiteHtmlCollectionAsync(request, cancellationToken);

        return GetDLSiteWorks(document.GetDLSiteNodes());
    }

    /// <summary>Builds the search URL, downloads it and wraps the HTML in a <see cref="DLSiteHtmlDocument"/>.</summary>
    private async Task<DLSiteHtmlDocument> GetDLSiteHtmlCollectionAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken)
    {
        string url = await GetUrlAsync(request, cancellationToken);
        HtmlDocument document = await htmlLoader.GetHtmlDocumentAsync(url, cancellationToken);

        return new DLSiteHtmlDocument(document);
    }

    /// <summary>
    /// Builds the search query for the requested page: includes every supported language and
    /// applies the spam-circle and AI-generated exclusions that are switched on.
    /// </summary>
    /// <param name="request">Page number and page size passed to the query builder.</param>
    /// <param name="cancellationToken">Forwarded to the spam-circle cache lookup.</param>
    /// <returns>The value produced by <c>DLSiteSearchFilterBuilder.BuildSearchQuery</c>.</returns>
    protected virtual async ValueTask<string> GetUrlAsync(ScanVoiceWorksRequest request, CancellationToken cancellationToken)
    {
        DLSiteSearchFilterBuilder filterBuilder = new();

        foreach (ISupportedLanguage supportedLanguage in SupportedLanguages)
        {
            filterBuilder.IncludeSupportedLanguage(supportedLanguage);
        }

        if (ExcludeSpamCircles)
        {
            // The cache is only consulted when the exclusion is actually enabled.
            string[] makerIds = await spamCircleCache.GetAsync(cancellationToken);

            foreach (string makerId in makerIds)
            {
                filterBuilder.ExcludeMaker(makerId);
            }
        }

        if (ExcludePartiallyAIGeneratedWorks)
        {
            filterBuilder.ExcludePartiallyAIGeneratedWorks();
        }

        if (ExcludeAIGeneratedWorks)
        {
            filterBuilder.ExcludeAIGeneratedWorks();
        }

        return filterBuilder.BuildSearchQuery(request.PageNumber, request.PageSize);
    }

    /// <summary>
    /// Converts every listing node into a <see cref="DLSiteWork"/>, preserving order.
    /// No spam filtering happens here; spam circles are excluded in the query built by
    /// <see cref="GetUrlAsync"/> when <see cref="ExcludeSpamCircles"/> is true.
    /// </summary>
    private List<DLSiteWork> GetDLSiteWorks(IEnumerable<DLSiteHtmlNode> nodes)
    {
        return nodes.Select(GetDLSiteWork).ToList();
    }

    /// <summary>Reads the fields of a single listing node into a new <see cref="DLSiteWork"/>.</summary>
    /// <exception cref="FormatException">The download count is present but is not a valid number.</exception>
    private DLSiteWork GetDLSiteWork(DLSiteHtmlNode node)
    {
        DLSiteWork work = new();

        work.ProductName = ScannerUtilities.GetDecodedText(node.ProductTextNode);
        work.ProductUrl = node.ProductLinkNode.Attributes["href"].Value;
        // The product id is the URL segment between "product_id/" and ".html".
        work.ProductId = ScannerUtilities.GetTextBetween(work.ProductUrl, "product_id/", ".html");

        work.Maker = ScannerUtilities.GetDecodedText(node.MakerLinkNode);
        string makerUrl = node.MakerLinkNode.Attributes["href"].Value;
        // The maker id is the URL segment between "maker_id/" and ".html".
        work.MakerId = ScannerUtilities.GetTextBetween(makerUrl, "maker_id/", ".html");

        work.Description = ScannerUtilities.GetDecodedText(node.DescriptionNode);

        // Date and download nodes are optional; the corresponding properties keep their
        // default value when the node is absent.
        if (node.ExpectedDateNode is not null)
        {
            work.ExpectedDate = GetEstimatedReleaseDate(node.ExpectedDateNode.InnerHtml.Trim());
        }

        if (node.SalesDateNode is not null)
        {
            work.SalesDate = GetSalesDate(node.SalesDateNode.InnerHtml);
        }

        if (node.DownloadsNode is not null)
        {
            // Parsed with the invariant culture so "1,234" is read the same way regardless of
            // the host's regional settings. The text is trimmed because AllowThousands on its
            // own rejects surrounding whitespace.
            work.Downloads = int.Parse(
                node.DownloadsNode.InnerHtml.Trim(),
                NumberStyles.AllowThousands,
                CultureInfo.InvariantCulture);
        }

        ScannedRating? rating = GetScannedRating(node.StarRatingNode);

        if (rating is not null)
        {
            work.StarRating = rating.Score;
            work.Votes = rating.Votes;
        }

        work.Genres = ScannerUtilities.GetStringListFromNodes(node.GenreNodes);
        work.Tags = ScannerUtilities.GetStringListFromNodes(node.SearchTagNodes);
        work.Creators = ScannerUtilities.GetStringListFromNodes(node.CreatorNodes);

        // The full-size image URL is derived from the listing image by swapping the "_sam"
        // file-name suffix for "_main".
        string imageSource = ScannerUtilities.GetImageSource(node.ImageNode);
        string imageUrl = imageSource
            .Replace("_sam.jpg", "_main.jpg")
            .Replace("_sam.gif", "_main.gif");

        work.SmallImageUrl = imageSource;
        work.ImageUrl = imageUrl;
        work.Type = imageUrl.Contains("ana/doujin") ? "Ana" : "Work";

        return work;
    }

    /// <summary>
    /// Extracts the score and vote count from a star-rating node. The score comes from the
    /// node's <c>star_*</c> CSS class (ignoring <c>star_rating</c>); the vote count is the
    /// text inside the first pair of parentheses in the node's inner text.
    /// </summary>
    /// <param name="starRatingNode">The rating node, or null when the listing has none.</param>
    /// <returns>
    /// The parsed rating, or null when the node is missing, has no score class, has no
    /// parenthesised vote count, or either value is not a valid number.
    /// </returns>
    private static ScannedRating? GetScannedRating(HtmlNode? starRatingNode)
    {
        if (starRatingNode is null)
        {
            return null;
        }

        string? ratingClass = starRatingNode
            .GetClasses()
            .FirstOrDefault(className => className.Contains("star_") && className != "star_rating");

        if (string.IsNullOrEmpty(ratingClass))
        {
            return null;
        }

        Match votesMatch = VotesRegex.Match(starRatingNode.InnerText);

        if (!votesMatch.Success)
        {
            return null;
        }

        // Both values are machine-readable page data, so they are parsed with the invariant
        // culture. A malformed value yields "no rating" instead of aborting the whole scan.
        bool hasScore = byte.TryParse(
            ratingClass.Replace("star_", ""),
            NumberStyles.Integer,
            CultureInfo.InvariantCulture,
            out byte score);

        bool hasVotes = int.TryParse(
            votesMatch.Groups[1].Value.Trim(),
            NumberStyles.AllowThousands,
            CultureInfo.InvariantCulture,
            out int votes);

        if (!hasScore || !hasVotes)
        {
            return null;
        }

        return new ScannedRating
        {
            Score = score,
            Votes = votes
        };
    }
}