Updated scanner logic to handle Vue.js thumbnail components. Removed unneeded DLSiteWork fields.
All checks were successful
ci / build-test (push) Successful in 2m21s
ci / publish-image (push) Has been skipped

This commit is contained in:
2026-02-28 22:20:24 -05:00
parent ca7ffa1730
commit 704a6fc433
9 changed files with 316 additions and 40 deletions

View File

@@ -2,37 +2,26 @@
namespace JSMR.Infrastructure.Scanning.Models;
public class DLSiteHtmlDocument
public class DLSiteHtmlDocument(HtmlDocument document)
{
private readonly HtmlNodeCollection _workColumns;
private readonly HtmlNodeCollection _workColumnRights;
private readonly HtmlNodeCollection _workThumbs;
private readonly HtmlNodeCollection _workColumns = document.DocumentNode.SelectNodes("//dl[@class='work_1col']");
private readonly HtmlNodeCollection _workColumnRights = document.DocumentNode.SelectNodes("//td[contains(@class, 'work_1col_right')]");
private readonly HtmlNodeCollection _workThumbs = document.DocumentNode.SelectNodes("//div[@class='work_thumb']");
public HtmlNode PageTotalNode { get; } = document.DocumentNode.SelectNodes("//div[@class='page_total']/strong")[0];
public HtmlNode PageTotalNode { get; }
public DLSiteHtmlDocument(HtmlDocument document)
public DLSiteHtmlNode[] GetDLSiteNodes()
{
_workColumns = document.DocumentNode.SelectNodes("//dl[@class='work_1col']");
//_workColumnRights = document.DocumentNode.SelectNodes("//td[@class='work_1col_right']");
_workColumnRights = document.DocumentNode.SelectNodes("//td[contains(@class, 'work_1col_right')]");
_workThumbs = document.DocumentNode.SelectNodes("//div[@class='work_thumb']");
PageTotalNode = document.DocumentNode.SelectNodes("//div[@class='page_total']/strong")[0];
}
public List<DLSiteHtmlNode> GetDLSiteNodes()
{
var nodes = new List<DLSiteHtmlNode>();
List<DLSiteHtmlNode> nodes = [];
if (_workColumns.Count != _workColumnRights.Count || _workColumns.Count != _workThumbs.Count)
throw new Exception("Work column node counts do not match!");
for (int i = 0; i < _workColumns.Count; i++)
{
var node = new DLSiteHtmlNode(_workColumns[i], _workColumnRights[i], _workThumbs[i]);
DLSiteHtmlNode node = new(_workColumns[i], _workColumnRights[i], _workThumbs[i]);
nodes.Add(node);
}
return nodes;
return [.. nodes];
}
}

View File

@@ -19,7 +19,8 @@ public class DLSiteHtmlNode
public HtmlNode? SalesDateNode { get; private set; }
public HtmlNode DownloadsNode { get; private set; }
public HtmlNode? StarRatingNode { get; private set; }
public HtmlNode ImageNode { get; private set; }
public HtmlNode? ImageNode { get; private set; }
public HtmlNode? ThumbWithNgFilterBlockNode { get; private set; }
public HtmlNode[] GenreNodes { get; private set; }
public HtmlNode[] SearchTagNodes { get; private set; }
public HtmlNode[] CreatorNodes { get; private set; }
@@ -55,7 +56,8 @@ public class DLSiteHtmlNode
//InitializeSalesAndDownloadsNodes();
StarRatingNode = GetStarRatingNode();
ImageNode = GetImageNode();
ImageNode = TryGetImageNode();
ThumbWithNgFilterBlockNode = ThumbNode.SelectSingleNode(".//thumb-with-ng-filter-block");
}
private HtmlNode[] GetGenreNodes()
@@ -165,10 +167,13 @@ public class DLSiteHtmlNode
// }
//}
private HtmlNode GetImageNode()
private HtmlNode? TryGetImageNode()
{
HtmlNode linkNode = ThumbNode.SelectNodes(".//a")[0];
HtmlNode? linkNode = ThumbNode.SelectSingleNode(".//a");
return linkNode.SelectNodes(".//img")[0];
if (linkNode is null)
return null;
return linkNode.SelectSingleNode(".//img");
}
}

View File

@@ -1,4 +1,5 @@
using HtmlAgilityPack;
using System.Text.Json;
using System.Web;
namespace JSMR.Infrastructure.Scanning;
@@ -45,4 +46,26 @@ public static class ScannerUtilities
return imageSource;
}
/// <summary>
/// Parses a JavaScript-style array literal of strings (for example <c>['a', 'b']</c>
/// or <c>["a", "b"]</c>) into a string array.
/// </summary>
/// <param name="value">The raw array literal text.</param>
/// <returns>
/// The parsed elements, or an empty array when <paramref name="value"/> is null,
/// empty, or whitespace.
/// </returns>
/// <remarks>
/// Parsing is attempted in three stages, from strictest to most lenient:
/// the input as-is (valid JSON), the input with single quotes rewritten to double
/// quotes, and finally a plain comma split with surrounding quotes stripped.
/// </remarks>
public static string[] ParseJavaScriptArray(string value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return [];
    }

    string trimmed = value.Trim();

    // Try the text unmodified first: rewriting quotes on already-valid JSON would
    // corrupt elements that contain an apostrophe.
    if (TryDeserializeStringArray(trimmed, out string[] parsed)
        || TryDeserializeStringArray(NormalizeJavaScriptArray(trimmed), out parsed))
    {
        return parsed;
    }

    // Last resort for input that is not JSON even after quote normalization.
    // This cannot cope with commas inside elements.
    return [.. trimmed
        .Trim('[', ']')
        .Split(',', StringSplitOptions.RemoveEmptyEntries)
        .Select(x => x.Trim().Trim('\'', '"'))];
}

/// <summary>
/// Attempts to deserialize <paramref name="json"/> as a JSON array of strings.
/// </summary>
/// <param name="json">The candidate JSON text.</param>
/// <param name="result">The parsed array on success; an empty array on failure or when the JSON is <c>null</c>.</param>
/// <returns><c>true</c> when the text is valid JSON convertible to a string array; otherwise <c>false</c>.</returns>
private static bool TryDeserializeStringArray(string json, out string[] result)
{
    try
    {
        result = JsonSerializer.Deserialize<string[]>(json) ?? [];
        return true;
    }
    catch (JsonException)
    {
        // Only parse failures are expected here; anything else should surface.
        result = [];
        return false;
    }
}

/// <summary>
/// Trims the input and rewrites every single quote to a double quote so that a
/// single-quoted JavaScript array literal can be read as JSON.
/// </summary>
/// <param name="input">The raw array literal text.</param>
/// <returns>The trimmed text with all <c>'</c> characters replaced by <c>"</c>.</returns>
private static string NormalizeJavaScriptArray(string input)
{
    return input.Trim().Replace('\'', '"');
}
}

View File

@@ -7,6 +7,7 @@ using JSMR.Domain.ValueObjects;
using JSMR.Infrastructure.Http;
using JSMR.Infrastructure.Scanning.Models;
using System.Globalization;
using System.Text.Json;
using System.Text.RegularExpressions;
namespace JSMR.Infrastructure.Scanning;
@@ -22,7 +23,7 @@ public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksSca
public async Task<IReadOnlyList<DLSiteWork>> ScanPageAsync(VoiceWorkScanOptions options, CancellationToken cancellationToken = default)
{
DLSiteHtmlDocument document = await GetDLSiteHtmlCollectionAsync(options, cancellationToken);
List<DLSiteHtmlNode> nodes = document.GetDLSiteNodes();
DLSiteHtmlNode[] nodes = document.GetDLSiteNodes();
return GetDLSiteWorks(nodes, options);
}
@@ -52,7 +53,7 @@ public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksSca
return filterBuilder.BuildSearchQuery(options.PageNumber, options.PageSize);
}
private List<DLSiteWork> GetDLSiteWorks(List<DLSiteHtmlNode> nodes, VoiceWorkScanOptions options)
private List<DLSiteWork> GetDLSiteWorks(DLSiteHtmlNode[] nodes, VoiceWorkScanOptions options)
{
var works = new List<DLSiteWork>();
@@ -73,8 +74,7 @@ public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksSca
{
string productUrl = node.ProductLinkNode.Attributes["href"].Value;
string makerUrl = node.MakerLinkNode.Attributes["href"].Value;
string imageSource = ScannerUtilities.GetImageSource(node.ImageNode);
string imageUrl = imageSource.Replace("_sam.jpg", "_main.jpg").Replace("_sam.gif", "_main.gif");
(string imageSource, string imageUrl) = TryGetImageSourceAndUrl(node);
ScannedRating? rating = GetScannedRating(node.StarRatingNode);
DLSiteWork work = new()
@@ -89,7 +89,7 @@ public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksSca
Creators = ScannerUtilities.GetStringListFromNodes(node.CreatorNodes),
SmallImageUrl = imageSource,
ImageUrl = imageUrl,
Type = imageUrl.Contains("ana/doujin") ? DLSiteWorkType.Announced : DLSiteWorkType.Released,
//Type = imageUrl.Contains("ana/doujin") ? DLSiteWorkType.Announced : DLSiteWorkType.Released,
StarRating = rating?.Score,
Votes = rating?.Votes,
AgeRating = GetAgeRating(node.GenreNodes)
@@ -113,6 +113,36 @@ public abstract class VoiceWorksScanner(IHtmlLoader htmlLoader) : IVoiceWorksSca
return work;
}
/// <summary>
/// Resolves the image source for a scanned node together with the URL derived from
/// it by replacing the <c>_sam</c> file suffix with <c>_main</c>.
/// </summary>
/// <param name="node">The scanned node whose thumbnail markup is inspected.</param>
/// <returns>
/// A tuple of (image source, derived image URL). The source comes from the first
/// non-blank entry of the <c>:thumb-candidates</c> attribute when a
/// <c>thumb-with-ng-filter-block</c> element is present, otherwise from the image node.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the thumb element has no usable candidate, or when neither the thumb
/// element nor an image node is available. Despite the <c>Try</c> prefix, this
/// method throws rather than returning a failure value.
/// </exception>
private static (string, string) TryGetImageSourceAndUrl(DLSiteHtmlNode node)
{
    if (node.ThumbWithNgFilterBlockNode is not null)
    {
        string candidates = node.ThumbWithNgFilterBlockNode.GetAttributeValue(":thumb-candidates", string.Empty);
        string[] imageUrls = ScannerUtilities.ParseJavaScriptArray(candidates);

        // Skip blank entries so an empty candidate is never stored as the image source.
        string? imageSource = Array.Find(imageUrls, url => !string.IsNullOrWhiteSpace(url));

        if (imageSource is null)
        {
            throw new InvalidOperationException("No thumb candidates found");
        }

        return (imageSource, ToMainImageUrl(imageSource));
    }

    if (node.ImageNode is not null)
    {
        string imageSource = ScannerUtilities.GetImageSource(node.ImageNode);

        return (imageSource, ToMainImageUrl(imageSource));
    }

    throw new InvalidOperationException("Unable to find image source and/or url");
}

/// <summary>
/// Rewrites a <c>_sam.jpg</c> / <c>_sam.gif</c> image source to its <c>_main</c> counterpart.
/// </summary>
/// <param name="imageSource">The image source to convert.</param>
/// <returns>The source with the suffix replaced, or unchanged when neither suffix occurs.</returns>
private static string ToMainImageUrl(string imageSource)
{
    return imageSource.Replace("_sam.jpg", "_main.jpg").Replace("_sam.gif", "_main.gif");
}
private static AgeRating GetAgeRating(HtmlNode[] genreNodes)
{
List<string> genres = ScannerUtilities.GetStringListFromNodes(genreNodes);