using HtmlAgilityPack;
using JSMR.Application.Enums;
using JSMR.Application.Scanning.Contracts;
using JSMR.Application.Scanning.Ports;
using JSMR.Domain.Enums;
using JSMR.Domain.ValueObjects;
using JSMR.Infrastructure.Http;
using JSMR.Infrastructure.Scanning.Extensions;
using JSMR.Infrastructure.Scanning.Models;
using System.Globalization;
using System.Net;
using System.Text.RegularExpressions;

namespace JSMR.Infrastructure.Scanning;

/// <summary>
/// Base class for scanners that load one page of search results through an
/// <see cref="IHtmlLoader"/> and convert every result node into a <see cref="DLSiteWork"/>.
/// Derived classes supply the locale, the supported languages and the date parsing.
/// </summary>
public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksScanner
{
    // Extracts the text between the first pair of parentheses (the vote count) from the
    // star rating node's inner text. Built once instead of on every call.
    private static readonly Regex VotesRegex = new(@"\((.*?)\)", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>Locale passed to the search filter builder.</summary>
    protected abstract Locale Locale { get; }

    /// <summary>Languages passed to the search filter builder.</summary>
    protected abstract SupportedLanguage[] SupportedLanguages { get; }

    /// <summary>Converts the trimmed inner HTML of the expected-date node into a date.</summary>
    /// <param name="expectedDate">Trimmed inner HTML of the expected-date node.</param>
    protected abstract DateOnly? GetEstimatedReleaseDate(string expectedDate);

    /// <summary>Converts the inner HTML of the sales-date node into a date.</summary>
    /// <param name="salesDate">Inner HTML of the sales-date node (passed through untrimmed).</param>
    protected abstract DateOnly? GetSalesDate(string salesDate);

    /// <summary>
    /// Loads the search page described by <paramref name="options"/> and parses the works on it.
    /// </summary>
    /// <param name="options">Page number, page size and exclusion filters for the scan.</param>
    /// <param name="cancellationToken">Token forwarded to the HTML loader.</param>
    /// <returns>
    /// The parsed works with <c>EndOfResults</c> set to <see langword="false"/>, or an empty result with
    /// <c>EndOfResults</c> set to <see langword="true"/> when the page returns 404 or contains no result nodes.
    /// </returns>
    /// <exception cref="HttpRequestException">
    /// The response is not successful (and not 404), or no document was returned.
    /// </exception>
    public async Task<VoiceWorkScanResult> ScanPageAsync(VoiceWorkScanOptions options, CancellationToken cancellationToken = default)
    {
        string url = GetUrl(options);
        HtmlLoadResult result = await htmlLoader.GetHtmlDocumentAsync(url, cancellationToken);

        // Expected boundary: past the last search page
        if (result.StatusCode == HttpStatusCode.NotFound)
        {
            return new VoiceWorkScanResult(
                Works: [],
                EndOfResults: true
            );
        }

        // Unexpected non-success response
        if (!result.IsSuccessStatusCode || result.Document is null)
        {
            throw new HttpRequestException(
                $"Unexpected response status code {(int)result.StatusCode} ({result.StatusCode}) while scanning {url}",
                inner: null,
                statusCode: result.StatusCode);
        }

        DLSiteHtmlDocument document = new(result.Document);
        DLSiteHtmlNode[] nodes = document.GetDLSiteNodes();

        // Defensive fallback in case DLsite changes from 404 to 200 with empty page
        if (nodes.Length == 0)
        {
            return new VoiceWorkScanResult(
                Works: [],
                EndOfResults: true
            );
        }

        DLSiteWork[] works = GetDLSiteWorks(nodes, options);
        works.InferAndUpdateExpectedDates();

        return new VoiceWorkScanResult(
            Works: works,
            EndOfResults: false
        );
    }

    /// <summary>
    /// Builds the search URL for the requested page from the scanner's locale and languages
    /// and the exclusion flags in <paramref name="options"/>.
    /// </summary>
    /// <param name="options">Scan options providing excluded makers, AI filters, page number and page size.</param>
    /// <returns>The search query produced by the filter builder.</returns>
    protected string GetUrl(VoiceWorkScanOptions options)
    {
        var filterBuilder = new DLSiteSearchFilterBuilder()
            .UseLocale(Locale)
            .IncludeSupportedLanguages(SupportedLanguages)
            .ExcludeMakers(options.ExcludedMakerIds);

        if (options.ExcludePartiallyAIGeneratedWorks)
        {
            filterBuilder.ExcludePartiallyAIGeneratedWorks();
        }

        if (options.ExcludeAIGeneratedWorks)
        {
            filterBuilder.ExcludeAIGeneratedWorks();
        }

        return filterBuilder.BuildSearchQuery(options.PageNumber, options.PageSize);
    }

    /// <summary>
    /// Parses every node into a work and drops the works whose maker id is listed in
    /// <c>options.ExcludedMakerIds</c>.
    /// </summary>
    private DLSiteWork[] GetDLSiteWorks(DLSiteHtmlNode[] nodes, VoiceWorkScanOptions options)
    {
        var works = new List<DLSiteWork>(nodes.Length);

        foreach (DLSiteHtmlNode node in nodes)
        {
            DLSiteWork work = GetDLSiteWork(node);

            // The makers are already excluded in the search URL; this filters the parsed results as well.
            if (options.ExcludedMakerIds.Any(makerId => makerId == work.MakerId))
            {
                continue;
            }

            works.Add(work);
        }

        return [.. works];
    }

    /// <summary>Maps a single search result node to a <see cref="DLSiteWork"/>.</summary>
    /// <exception cref="InvalidOperationException">No image could be resolved for the node.</exception>
    /// <exception cref="FormatException">The downloads, rating or vote text is not a valid number.</exception>
    private DLSiteWork GetDLSiteWork(DLSiteHtmlNode node)
    {
        string productUrl = node.ProductLinkNode.Attributes["href"].Value;
        string makerUrl = node.MakerLinkNode.Attributes["href"].Value;

        (string imageSource, string imageUrl) = TryGetImageSourceAndUrl(node);
        ScannedRating? rating = GetScannedRating(node.StarRatingNode);

        DLSiteWork work = new()
        {
            ProductName = ScannerUtilities.GetDecodedText(node.ProductTextNode),
            Description = ScannerUtilities.GetDecodedText(node.DescriptionNode),
            ProductId = ScannerUtilities.GetTextBetween(productUrl, "product_id/", ".html"),
            Maker = ScannerUtilities.GetDecodedText(node.MakerLinkNode),
            MakerId = ScannerUtilities.GetTextBetween(makerUrl, "maker_id/", ".html"),
            Genres = ScannerUtilities.GetStringListFromNodes(node.GenreNodes),
            Tags = ScannerUtilities.GetStringListFromNodes(node.SearchTagNodes),
            Creators = ScannerUtilities.GetStringListFromNodes(node.CreatorNodes),
            SmallImageUrl = imageSource,
            ImageUrl = imageUrl,
            //Type = imageUrl.Contains("ana/doujin") ? DLSiteWorkType.Announced : DLSiteWorkType.Released,
            StarRating = rating?.Score,
            Votes = rating?.Votes,
            AgeRating = GetAgeRating(node.GenreNodes)
        };

        if (node.ExpectedDateNode is not null)
        {
            work.ExpectedDate = GetEstimatedReleaseDate(node.ExpectedDateNode.InnerHtml.Trim());
        }

        if (node.SalesDateNode is not null)
        {
            work.SalesDate = GetSalesDate(node.SalesDateNode.InnerHtml);
        }

        if (node.DownloadsNode is not null)
        {
            // Scraped text such as "1,234": parse with the invariant culture so the result
            // does not depend on the host's regional settings.
            work.Downloads = int.Parse(
                node.DownloadsNode.InnerHtml,
                NumberStyles.AllowThousands,
                CultureInfo.InvariantCulture);
        }

        return work;
    }

    /// <summary>
    /// Resolves the thumbnail source and the derived main image URL for a node, preferring the
    /// first entry of the <c>:thumb-candidates</c> attribute and falling back to the image node.
    /// </summary>
    /// <returns>The thumbnail source and the main image URL derived from it.</returns>
    /// <exception cref="InvalidOperationException">
    /// The candidates attribute yields no URLs, or the node has neither a thumbnail block nor an image node.
    /// </exception>
    private static (string, string) TryGetImageSourceAndUrl(DLSiteHtmlNode node)
    {
        if (node.ThumbWithNgFilterBlockNode is not null)
        {
            string candidates = node.ThumbWithNgFilterBlockNode.GetAttributeValue(":thumb-candidates", string.Empty);
            string[] imageUrls = ScannerUtilities.ParseJavaScriptArray(candidates);

            if (imageUrls.Length == 0)
            {
                throw new InvalidOperationException("No thumb candidates found");
            }

            string imageSource = imageUrls[0];
            return (imageSource, ToMainImageUrl(imageSource));
        }

        if (node.ImageNode is not null)
        {
            string imageSource = ScannerUtilities.GetImageSource(node.ImageNode);
            return (imageSource, ToMainImageUrl(imageSource));
        }

        throw new InvalidOperationException("Unable to find image source and/or url");
    }

    /// <summary>Derives the main image URL by swapping the <c>_sam</c> suffix for <c>_main</c>.</summary>
    private static string ToMainImageUrl(string imageSource) =>
        imageSource.Replace("_sam.jpg", "_main.jpg").Replace("_sam.gif", "_main.gif");

    /// <summary>
    /// Derives the age rating from the genre texts: "全年齢" maps to <see cref="AgeRating.AllAges"/>,
    /// "R-15" to <see cref="AgeRating.R15"/>, and anything else to <see cref="AgeRating.R18"/>.
    /// </summary>
    private static AgeRating GetAgeRating(HtmlNode[] genreNodes)
    {
        var genres = ScannerUtilities.GetStringListFromNodes(genreNodes);

        if (genres.Contains("全年齢"))
        {
            return AgeRating.AllAges;
        }

        if (genres.Contains("R-15"))
        {
            return AgeRating.R15;
        }

        return AgeRating.R18;
    }

    /// <summary>
    /// Reads the score from the node's <c>star_*</c> CSS class and the vote count from the
    /// parenthesised number in its inner text.
    /// </summary>
    /// <returns>
    /// The rating, or <see langword="null"/> when the node is missing, has no <c>star_*</c> class
    /// other than <c>star_rating</c>, or its text contains no parenthesised value.
    /// </returns>
    /// <exception cref="FormatException">The score or the vote count is not a valid number.</exception>
    private static ScannedRating? GetScannedRating(HtmlNode? starRatingNode)
    {
        if (starRatingNode is null)
        {
            return null;
        }

        string? ratingClass = starRatingNode.GetClasses()
            .FirstOrDefault(className => className.Contains("star_") && className != "star_rating");

        if (string.IsNullOrEmpty(ratingClass))
        {
            return null;
        }

        Match voteMatch = VotesRegex.Match(starRatingNode.InnerText);

        if (!voteMatch.Success)
        {
            return null;
        }

        // Both values come from scraped markup, so they are parsed culture-independently.
        return new ScannedRating
        {
            Score = byte.Parse(ratingClass.Replace("star_", ""), CultureInfo.InvariantCulture),
            Votes = int.Parse(voteMatch.Groups[1].Value, NumberStyles.AllowThousands, CultureInfo.InvariantCulture)
        };
    }
}