diff options
author | Wolfgang Müller | 2024-03-05 18:08:09 +0100 |
---|---|---|
committer | Wolfgang Müller | 2024-03-05 19:25:59 +0100 |
commit | d1d654ebac2d51e3841675faeb56480e440f622f (patch) | |
tree | 56ef123c1a15a10dfd90836e4038e27efde950c6 /src/hircine/plugins | |
download | hircine-d1d654ebac2d51e3841675faeb56480e440f622f.tar.gz |
Initial commit (tag: 0.1.0)
Diffstat (limited to 'src/hircine/plugins')
-rw-r--r-- | src/hircine/plugins/__init__.py | 49 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/__init__.py | 0 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/anchira.py | 101 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/ehentai_api.py | 75 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/gallery_dl.py | 54 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/handlers/__init__.py | 0 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/handlers/dynastyscans.py | 41 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/handlers/e621.py | 81 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/handlers/exhentai.py | 139 | ||||
-rw-r--r-- | src/hircine/plugins/scrapers/handlers/mangadex.py | 54 |
10 files changed, 594 insertions, 0 deletions
# --- a/dev/null +++ b/src/hircine/plugins/__init__.py (new file, 49 lines) ---

from importlib.metadata import entry_points
from typing import Dict, Type

from hircine.scraper import Scraper

# Registry mapping scraper names to scraper classes, populated by load().
scraper_registry: Dict[str, Type[Scraper]] = {}
# Transformer generator functions registered via the @transformer decorator.
transformers = []


def get_scraper(name):
    """Return the scraper class registered under *name*, or None."""
    return scraper_registry.get(name, None)


def get_scrapers():
    """Return an iterable view of (name, scraper class) pairs."""
    return scraper_registry.items()


def register_scraper(name, cls):
    """Register *cls* as the scraper implementation for *name*."""
    scraper_registry[name] = cls


def transformer(function):
    """
    Marks the decorated function as a transformer.

    The decorated function must be a generator function that yields
    :ref:`scraped-data`. The following parameters will be available to the
    decorated function:

    :param generator: The scraper's generator function.
    :param ScraperInfo info: Information on the scraper.
    """
    # The original wrapped this in an inner _decorate() helper that was
    # invoked immediately; registering and returning directly is equivalent.
    transformers.append(function)
    return function


def load():  # pragma: nocover
    """Populate the registries from installed entry points.

    Scrapers are registered by name; transformer modules are merely
    imported, registering themselves via the @transformer decorator.
    """
    for entry in entry_points(group="hircine.scraper"):
        register_scraper(entry.name, entry.load())

    for entry in entry_points(group="hircine.transformer"):
        entry.load()


load()

# --- b/src/hircine/plugins/scrapers/__init__.py (new, empty file) ---

# --- b/src/hircine/plugins/scrapers/anchira.py (new file, 101 lines) ---
import re

import yaml

import hircine.enums as enums
from hircine.scraper import Scraper
from hircine.scraper.types import (
    URL,
    Artist,
    Censorship,
    Circle,
    Date,
    Direction,
    Language,
    Rating,
    Tag,
    Title,
    World,
)
from hircine.scraper.utils import open_archive_file
+URL_REGEX = re.compile(r"^https?://anchira\.to/g/") + + +class AnchiraYamlScraper(Scraper): + """ + A scraper for ``info.yaml`` files found in archives downloaded from + *anchira.to*. + + .. list-table:: + :align: left + + * - **Requires** + - ``info.yaml`` in the archive or as a sidecar. + * - **Source** + - ``anchira.to`` + """ + + name = "anchira.to info.yaml" + source = "anchira.to" + + def __init__(self, comic): + super().__init__(comic) + + self.data = self.load() + source = self.data.get("Source") + + if source and re.match(URL_REGEX, source): + self.is_available = True + + def load(self): + try: + with open_archive_file(self.comic.archive, "info.yaml") as yif: + return yaml.safe_load(yif) + except Exception: + return {} + + def scrape(self): + parsers = { + "Title": Title, + "Artist": Artist, + "URL": URL, + "Released": Date.from_timestamp, + "Circle": Circle, + "Parody": self.parse_world, + "Tags": self.parse_tag, + } + + for field, parser in parsers.items(): + if field not in self.data: + continue + + value = self.data[field] + + if isinstance(value, list): + yield from [lambda i=x: parser(i) for x in value] + else: + yield lambda: parser(value) + + yield Language(enums.Language.EN) + yield Direction(enums.Direction.RIGHT_TO_LEFT) + + def parse_world(self, input): + match input: + case "Original Work": + return + + return World(input) + + def parse_tag(self, input): + match input: + case "Unlimited": + return + case "Hentai": + return Rating(value=enums.Rating.EXPLICIT) + case "Non-H" | "Ecchi": + return Rating(value=enums.Rating.QUESTIONABLE) + case "Uncensored": + return Censorship(value=enums.Censorship.NONE) + case _: + return Tag.from_string(input) diff --git a/src/hircine/plugins/scrapers/ehentai_api.py b/src/hircine/plugins/scrapers/ehentai_api.py new file mode 100644 index 0000000..70fcf57 --- /dev/null +++ b/src/hircine/plugins/scrapers/ehentai_api.py @@ -0,0 +1,75 @@ +import html +import json +import re + +import requests + +from 
hircine.scraper import ScrapeError, Scraper + +from .handlers.exhentai import ExHentaiHandler + +API_URL = "https://api.e-hentai.org/api.php" +URL_REGEX = re.compile( + r"^https?://(?:exhentai|e-hentai).org/g/(?P<id>\d+)/(?P<token>[0-9a-fA-F]+).*" +) + + +class EHentaiAPIScraper(Scraper): + """ + A scraper for the `E-Hentai API <https://ehwiki.org/wiki/API>`_. + + .. list-table:: + :align: left + + * - **Requires** + - The comic :attr:`URL <hircine.api.types.FullComic.url>` pointing to + a gallery on *e-hentai.org* or *exhentai.org* + * - **Source** + - ``exhentai`` + + """ + + name = "e-hentai.org API" + source = "exhentai" + + def __init__(self, comic): + super().__init__(comic) + + if self.comic.url: + match = re.fullmatch(URL_REGEX, self.comic.url) + + if match: + self.is_available = True + self.id = int(match.group("id")) + self.token = match.group("token") + + def scrape(self): + data = json.dumps( + { + "method": "gdata", + "gidlist": [[self.id, self.token]], + "namespace": 1, + }, + separators=(",", ":"), + ) + + request = requests.post(API_URL, data=data) + + if request.status_code == requests.codes.ok: + try: + response = json.loads(request.text)["gmetadata"][0] + + title = response.get("title") + if title: + response["title"] = html.unescape(title) + + title_jpn = response.get("title_jpn") + if title_jpn: + response["title_jpn"] = html.unescape(title_jpn) + + handler = ExHentaiHandler() + yield from handler.scrape(response) + except json.JSONDecodeError: + raise ScrapeError("Could not parse JSON response") + else: + raise ScrapeError(f"Request failed with status code {request.status_code}'") diff --git a/src/hircine/plugins/scrapers/gallery_dl.py b/src/hircine/plugins/scrapers/gallery_dl.py new file mode 100644 index 0000000..a6cebc4 --- /dev/null +++ b/src/hircine/plugins/scrapers/gallery_dl.py @@ -0,0 +1,54 @@ +import json + +from hircine.scraper import Scraper +from hircine.scraper.utils import open_archive_file + +from .handlers.dynastyscans import 
DynastyScansHandler +from .handlers.e621 import E621Handler +from .handlers.exhentai import ExHentaiHandler +from .handlers.mangadex import MangadexHandler + +HANDLERS = { + "dynastyscans": DynastyScansHandler, + "e621": E621Handler, + "exhentai": ExHentaiHandler, + "mangadex": MangadexHandler, +} + + +class GalleryDLScraper(Scraper): + """ + A scraper for `gallery-dl's <https://github.com/mikf/gallery-dl>`_ + ``info.json`` files. For now supports only a select subset of extractors. + + .. list-table:: + :align: left + + * - **Requires** + - ``info.json`` in the archive or as a sidecar. + * - **Source** + - ``dynastyscans``, ``e621``, ``exhentai``, ``mangadex`` + """ + + def __init__(self, comic): + super().__init__(comic) + + self.data = self.load() + category = self.data.get("category") + + if category in HANDLERS.keys(): + self.is_available = True + + self.handler = HANDLERS.get(category)() + self.source = self.handler.source + self.name = f"gallery-dl info.json ({self.source})" + + def load(self): + try: + with open_archive_file(self.comic.archive, "info.json") as jif: + return json.load(jif) + except Exception: + return {} + + def scrape(self): + yield from self.handler.scrape(self.data) diff --git a/src/hircine/plugins/scrapers/handlers/__init__.py b/src/hircine/plugins/scrapers/handlers/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/hircine/plugins/scrapers/handlers/__init__.py diff --git a/src/hircine/plugins/scrapers/handlers/dynastyscans.py b/src/hircine/plugins/scrapers/handlers/dynastyscans.py new file mode 100644 index 0000000..ded015b --- /dev/null +++ b/src/hircine/plugins/scrapers/handlers/dynastyscans.py @@ -0,0 +1,41 @@ +import hircine.enums as enums +from hircine.scraper import ScrapeWarning +from hircine.scraper.types import ( + Artist, + Circle, + Date, + Language, + Title, +) +from hircine.scraper.utils import parse_dict + + +class DynastyScansHandler: + source = "dynastyscans" + + def scrape(self, data): + 
parsers = { + "date": Date.from_iso, + "lang": self.parse_language, + "author": Artist, + "group": Circle, + } + + yield from parse_dict(parsers, data) + + if manga := data.get("manga"): + title = manga + + if chapter := data.get("chapter"): + title = title + f" Ch. {chapter}" + + if subtitle := data.get("title"): + title = title + f": {subtitle}" + + yield Title(title) + + def parse_language(self, input): + try: + return Language(value=enums.Language[input.upper()]) + except (KeyError, ValueError) as e: + raise ScrapeWarning(f"Could not parse language: '{input}'") from e diff --git a/src/hircine/plugins/scrapers/handlers/e621.py b/src/hircine/plugins/scrapers/handlers/e621.py new file mode 100644 index 0000000..6b798fd --- /dev/null +++ b/src/hircine/plugins/scrapers/handlers/e621.py @@ -0,0 +1,81 @@ +import hircine.enums as enums +from hircine.scraper import ScrapeWarning +from hircine.scraper.types import ( + URL, + Artist, + Category, + Censorship, + Character, + Date, + Language, + Rating, + Tag, + Title, + World, +) +from hircine.scraper.utils import parse_dict + + +def replace_underscore(fun): + return lambda input: fun(input.replace("_", " ")) + + +class E621Handler: + source = "e621" + + ratings = { + "e": Rating(enums.Rating.EXPLICIT), + "q": Rating(enums.Rating.QUESTIONABLE), + "s": Rating(enums.Rating.SAFE), + } + + def scrape(self, data): + match data.get("subcategory"): + case "pool": + yield from self.scrape_pool(data) + + def scrape_pool(self, data): + parsers = { + "date": Date.from_iso, + "rating": self.ratings.get, + "pool": { + "id": lambda pid: URL(f"https://e621.net/pools/{pid}"), + "name": Title, + }, + "tags": { + "general": replace_underscore(Tag.from_string), + "artist": replace_underscore(Artist), + "character": replace_underscore(Character), + "copyright": replace_underscore(World), + "species": replace_underscore(Tag.from_string), + "meta": self.parse_meta, + }, + } + + self.is_likely_uncensored = True + + yield from parse_dict(parsers, 
data) + + if self.is_likely_uncensored: + yield Censorship(enums.Censorship.NONE) + + def parse_meta(self, input): + match input: + case "comic": + return Category(enums.Category.COMIC) + case "censor_bar": + self.is_likely_uncensored = False + return Censorship(enums.Censorship.BAR) + case "mosaic_censorship": + self.is_likely_uncensored = False + return Censorship(enums.Censorship.MOSAIC) + case "uncensored": + return Censorship(enums.Censorship.NONE) + + if input.endswith("_text"): + lang, _ = input.split("_text", 1) + + try: + return Language(value=enums.Language(lang.capitalize())) + except ValueError as e: + raise ScrapeWarning(f"Could not parse language: '{input}'") from e diff --git a/src/hircine/plugins/scrapers/handlers/exhentai.py b/src/hircine/plugins/scrapers/handlers/exhentai.py new file mode 100644 index 0000000..12c22d7 --- /dev/null +++ b/src/hircine/plugins/scrapers/handlers/exhentai.py @@ -0,0 +1,139 @@ +import re + +import hircine.enums as enums +from hircine.scraper import ScrapeWarning +from hircine.scraper.types import ( + URL, + Artist, + Category, + Censorship, + Character, + Circle, + Date, + Direction, + Language, + OriginalTitle, + Rating, + Tag, + Title, + World, +) +from hircine.scraper.utils import parse_dict + + +def sanitize(title, split=False): + text = re.sub(r"\[[^\]]+\]|{[^}]+}|=[^=]+=|^\([^)]+\)", "", title) + if "|" in text and split: + orig, text = text.split("|", 1) + + return re.sub(r"\s{2,}", " ", text).strip() + + +class ExHentaiHandler: + source = "exhentai" + + def scrape(self, data): + category_field = "eh_category" if "eh_category" in data else "category" + + parsers = { + category_field: self.parse_category, + "posted": Date.from_timestamp, + "date": Date.from_iso, + "lang": self.parse_language, + "tags": self.parse_tag, + "title": lambda t: Title(sanitize(t, split=True)), + "title_jpn": lambda t: OriginalTitle(sanitize(t)), + } + + self.is_likely_pornographic = True + self.is_likely_rtl = False + 
self.has_censorship_tag = False + self.is_western = False + + yield from parse_dict(parsers, data) + + if self.is_likely_pornographic: + yield Rating(enums.Rating.EXPLICIT) + + if not self.has_censorship_tag: + if self.is_western: + yield Censorship(enums.Censorship.NONE) + else: + yield Censorship(enums.Censorship.BAR) + + if self.is_likely_rtl: + yield Direction(enums.Direction.RIGHT_TO_LEFT) + + if (gid := data["gid"]) and (token := data["token"]): + yield URL(f"https://exhentai.org/g/{gid}/{token}") + + def parse_category(self, input): + match input.lower(): + case "doujinshi": + self.is_likely_rtl = True + return Category(value=enums.Category.DOUJINSHI) + case "manga": + self.is_likely_rtl = True + return Category(value=enums.Category.MANGA) + case "western": + self.is_western = True + case "artist cg": + return Category(value=enums.Category.COMIC) + case "game cg": + return Category(value=enums.Category.GAME_CG) + case "image set": + return Category(value=enums.Category.IMAGE_SET) + case "non-h": + self.is_likely_pornographic = False + return Rating(value=enums.Rating.QUESTIONABLE) + + def parse_tag(self, input): + match input.split(":"): + case ["parody", value]: + return World(value) + case ["group", value]: + return Circle(value) + case ["artist", value]: + return Artist(value) + case ["character", value]: + return Character(value) + case ["language", value]: + return self.parse_language(value, from_value=True) + case ["other", "artbook"]: + return Category(enums.Category.ARTBOOK) + case ["other", "full censorship"]: + self.has_censorship_tag = True + return Censorship(enums.Censorship.FULL) + case ["other", "mosaic censorship"]: + self.has_censorship_tag = True + return Censorship(enums.Censorship.MOSAIC) + case ["other", "uncensored"]: + self.has_censorship_tag = True + return Censorship(enums.Censorship.NONE) + case ["other", "non-h imageset" | "western imageset"]: + return Category(value=enums.Category.IMAGE_SET) + case ["other", "western non-h"]: + 
self.is_likely_pornographic = False + return Rating(value=enums.Rating.QUESTIONABLE) + case ["other", "comic"]: + return Category(value=enums.Category.COMIC) + case ["other", "variant set"]: + return Category(value=enums.Category.VARIANT_SET) + case ["other", "webtoon"]: + return Category(value=enums.Category.WEBTOON) + case [namespace, tag]: + return Tag(namespace=namespace, tag=tag) + case [tag]: + return Tag(namespace=None, tag=tag) + + def parse_language(self, input, from_value=False): + if not input or input in ["translated", "speechless", "N/A"]: + return + + try: + if from_value: + return Language(value=enums.Language(input.capitalize())) + else: + return Language(value=enums.Language[input.upper()]) + except (KeyError, ValueError) as e: + raise ScrapeWarning(f"Could not parse language: '{input}'") from e diff --git a/src/hircine/plugins/scrapers/handlers/mangadex.py b/src/hircine/plugins/scrapers/handlers/mangadex.py new file mode 100644 index 0000000..7bc371d --- /dev/null +++ b/src/hircine/plugins/scrapers/handlers/mangadex.py @@ -0,0 +1,54 @@ +import hircine.enums as enums +from hircine.scraper import ScrapeWarning +from hircine.scraper.types import ( + URL, + Artist, + Circle, + Date, + Language, + Tag, + Title, +) +from hircine.scraper.utils import parse_dict + + +class MangadexHandler: + source = "mangadex" + + def scrape(self, data): + parsers = { + "date": Date.from_iso, + "lang": self.parse_language, + "tags": Tag.from_string, + "artist": Artist, + "author": Artist, + "group": Circle, + } + + yield from parse_dict(parsers, data) + + if chapter_id := data.get("chapter_id"): + yield URL(f"https://mangadex.org/chapter/{chapter_id}") + + if manga := data.get("manga"): + title = manga + + if volume := data.get("volume"): + title = title + f" Vol. {volume}" + + if chapter := data.get("chapter"): + if volume: + title = title + f", Ch. {chapter}" + else: + title = title + f"Ch. 
{chapter}" + + if subtitle := data.get("title"): + title = title + f": {subtitle}" + + yield Title(title) + + def parse_language(self, input): + try: + return Language(value=enums.Language[input.upper()]) + except (KeyError, ValueError) as e: + raise ScrapeWarning(f"Could not parse language: '{input}'") from e |