From 22fd0e2b1b117e78529c9f562cab79da4c02797e Mon Sep 17 00:00:00 2001 From: Wolfgang Müller Date: Thu, 16 Jan 2025 17:38:48 +0100 Subject: backend/plugins: Add scraper for info.yaml from schale.network --- docs/plugins/builtin.rst | 2 + pyproject.toml | 1 + src/hircine/plugins/scrapers/schale_network.py | 82 ++++++++++++++++++++++++ tests/plugins/scrapers/test_schale_network.py | 88 ++++++++++++++++++++++++++ 4 files changed, 173 insertions(+) create mode 100644 src/hircine/plugins/scrapers/schale_network.py create mode 100644 tests/plugins/scrapers/test_schale_network.py diff --git a/docs/plugins/builtin.rst b/docs/plugins/builtin.rst index 61d531f..7b815ce 100644 --- a/docs/plugins/builtin.rst +++ b/docs/plugins/builtin.rst @@ -14,3 +14,5 @@ Scrapers .. autoclass:: hircine.plugins.scrapers.ehentai_api.EHentaiAPIScraper() .. autoclass:: hircine.plugins.scrapers.anchira.AnchiraYamlScraper() + +.. autoclass:: hircine.plugins.scrapers.schale_network.SchaleNetworkScraper() diff --git a/pyproject.toml b/pyproject.toml index 20861dc..f83359b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ hircine = 'hircine.cli:main' gallery_dl = "hircine.plugins.scrapers.gallery_dl:GalleryDLScraper" ehentai_api = "hircine.plugins.scrapers.ehentai_api:EHentaiAPIScraper" anchira_yaml = "hircine.plugins.scrapers.anchira:AnchiraYamlScraper" +schale_network_yaml = "hircine.plugins.scrapers.schale_network:SchaleNetworkScraper" [tool.poetry.dependencies] python = "^3.12" diff --git a/src/hircine/plugins/scrapers/schale_network.py b/src/hircine/plugins/scrapers/schale_network.py new file mode 100644 index 0000000..e38cfe8 --- /dev/null +++ b/src/hircine/plugins/scrapers/schale_network.py @@ -0,0 +1,82 @@ +import re + +import yaml + +import hircine.enums as enums +from hircine.scraper import Scraper +from hircine.scraper.types import ( + Artist, + Censorship, + Circle, + Direction, + Language, + Tag, + Title, +) +from hircine.scraper.utils import open_archive_file, 
# Prefix that identifies metadata files originating from schale.network.
SOURCE_REGEX = re.compile(r"^SchaleNetwork:")


class SchaleNetworkScraper(Scraper):
    """
    A scraper for ``info.yaml`` files found in archives downloaded from
    *schale.network*.

    .. list-table::
       :align: left

       * - **Requires**
         - ``info.yaml`` in the archive or as a sidecar.
       * - **Source**
         - ``schale.network``
    """

    name = "schale.network info.yaml"
    source = "schale.network"

    def __init__(self, comic):
        """
        Load ``info.yaml`` for *comic* and decide whether this scraper applies.

        The scraper marks itself available only if the loaded metadata has a
        string ``source`` field matching :data:`SOURCE_REGEX`.
        """
        super().__init__(comic)

        self.data = self.load()
        source = self.data.get("source")

        # YAML scalars are not guaranteed to be strings (e.g. ``source: 1``
        # loads as an int), so check the type before applying the regex.
        if isinstance(source, str) and SOURCE_REGEX.match(source):
            self.is_available = True

    def load(self):
        """
        Return the parsed contents of ``info.yaml`` as a dictionary.

        Returns an empty dictionary if the file cannot be opened or parsed,
        or if its top-level YAML node is not a mapping (for example an empty
        file, which ``yaml.safe_load`` parses to ``None``).
        """
        try:
            with open_archive_file(self.comic.archive, "info.yaml") as yif:
                data = yaml.safe_load(yif)
        except Exception:
            # Deliberate best-effort: a missing or unreadable file simply
            # means this scraper is not available for the comic.
            return {}

        # Callers rely on ``dict`` methods; reject any other document shape.
        return data if isinstance(data, dict) else {}

    def scrape(self):
        """
        Yield the scraped metadata for the comic.

        Each known key of the loaded data is handed to its parser through
        ``parse_dict``; afterwards a right-to-left reading direction is
        always yielded.
        """
        parsers = {
            "title": Title,
            "artist": Artist,
            "circle": Circle,
            "general": Tag.from_string,
            "male": lambda s: Tag(namespace="male", tag=s),
            "female": lambda s: Tag(namespace="female", tag=s),
            "mixed": lambda s: Tag(namespace="mixed", tag=s),
            "language": self.parse_language,
            "other": self.parse_other,
        }

        yield from parse_dict(parsers, self.data)

        yield Direction(enums.Direction.RIGHT_TO_LEFT)

    def parse_language(self, input):
        """
        Convert a language entry into a ``Language`` item.

        Returns ``None`` for empty entries and for the ``translated`` marker,
        which does not name a language.
        """
        if not input or input == "translated":
            return None

        return Language.from_name(input)

    def parse_other(self, input):
        """
        Convert an entry of the ``other`` list into a metadata item.

        ``uncensored`` becomes a ``Censorship`` item with value ``NONE``;
        every other entry is parsed as a tag via ``Tag.from_string``.
        """
        if input == "uncensored":
            return Censorship(value=enums.Censorship.NONE)

        return Tag.from_string(input)
@pytest.fixture
def archive_file(tmpdir):
    """Create a zip archive holding a sample ``info.yaml`` and yield its path."""
    archive_path = os.path.join(tmpdir, "archive.zip")

    info_yaml = """
source: SchaleNetwork:/g/1/1
title: 'Example Title'
general:
  - example
artist:
  - example
circle:
  - example
magazine:
  - example
male:
  - example
female:
  - example
mixed:
  - example
language:
  - english
  - translated
other:
  - uncensored
  - vanilla
"""

    # Mode "x" creates the archive exclusively; it must not exist beforehand.
    with ZipFile(archive_path, "x") as archive:
        archive.writestr("info.yaml", info_yaml)

    yield archive_path


def test_does_scrape(monkeypatch, archive_file, gen_comic):
    """The scraper is available for the sample archive and yields all items."""
    comic = next(gen_comic)
    comic.archive.path = archive_file

    scraper = SchaleNetworkScraper(comic)

    assert scraper.is_available
    assert scraper.source == SchaleNetworkScraper.source
    assert scraper.name == "schale.network info.yaml"

    expected = {
        Artist(name="example"),
        Circle(name="example"),
        Direction(value=enums.Direction.RIGHT_TO_LEFT),
        Censorship(value=enums.Censorship.NONE),
        Language(value=enums.Language.EN),
        Tag(namespace="none", tag="example"),
        Tag(namespace="none", tag="vanilla"),
        Tag(namespace="male", tag="example"),
        Tag(namespace="female", tag="example"),
        Tag(namespace="mixed", tag="example"),
        Title(value="Example Title"),
    }

    assert set(scraper.collect()) == expected


def test_does_not_scrape_on_error(tmpdir, gen_comic):
    """A missing archive leaves the scraper unavailable with empty data."""
    comic = next(gen_comic)
    comic.archive.path = os.path.join(tmpdir, "nonexistent.zip")

    scraper = SchaleNetworkScraper(comic)

    assert scraper.data == {}
    assert not scraper.is_available