summaryrefslogtreecommitdiffstatshomepage
path: root/src/hircine/plugins
diff options
context:
space:
mode:
author  Wolfgang Müller  2024-03-05 18:08:09 +0100
committer  Wolfgang Müller  2024-03-05 19:25:59 +0100
commitd1d654ebac2d51e3841675faeb56480e440f622f (patch)
tree56ef123c1a15a10dfd90836e4038e27efde950c6 /src/hircine/plugins
downloadhircine-d1d654ebac2d51e3841675faeb56480e440f622f.tar.gz
Initial commit (tag: 0.1.0)
Diffstat (limited to 'src/hircine/plugins')
-rw-r--r--src/hircine/plugins/__init__.py49
-rw-r--r--src/hircine/plugins/scrapers/__init__.py0
-rw-r--r--src/hircine/plugins/scrapers/anchira.py101
-rw-r--r--src/hircine/plugins/scrapers/ehentai_api.py75
-rw-r--r--src/hircine/plugins/scrapers/gallery_dl.py54
-rw-r--r--src/hircine/plugins/scrapers/handlers/__init__.py0
-rw-r--r--src/hircine/plugins/scrapers/handlers/dynastyscans.py41
-rw-r--r--src/hircine/plugins/scrapers/handlers/e621.py81
-rw-r--r--src/hircine/plugins/scrapers/handlers/exhentai.py139
-rw-r--r--src/hircine/plugins/scrapers/handlers/mangadex.py54
10 files changed, 594 insertions, 0 deletions
diff --git a/src/hircine/plugins/__init__.py b/src/hircine/plugins/__init__.py
new file mode 100644
index 0000000..27e55a7
--- /dev/null
+++ b/src/hircine/plugins/__init__.py
@@ -0,0 +1,49 @@
+from importlib.metadata import entry_points
+from typing import Dict, Type
+
+from hircine.scraper import Scraper
+
# Registry of scraper classes keyed by their entry-point name; populated by
# register_scraper() when load() walks the "hircine.scraper" entry points.
scraper_registry: Dict[str, Type[Scraper]] = {}
# Transformer generator functions collected by the @transformer decorator.
transformers = []
+
+
def get_scraper(name):
    """Look up a scraper class by its registered name.

    Returns ``None`` when no scraper was registered under *name*.
    """
    try:
        return scraper_registry[name]
    except KeyError:
        return None
+
+
def get_scrapers():
    """Return a view of ``(name, scraper class)`` pairs for all registered scrapers."""
    return scraper_registry.items()
+
+
def register_scraper(name, cls):
    """Register the scraper class *cls* under *name*, replacing any previous entry."""
    scraper_registry[name] = cls
+
+
def transformer(function):
    """
    Marks the decorated function as a transformer.

    The decorated function must be a generator function that yields
    :ref:`scraped-data`. The following parameters will be available to the
    decorated function:

    :param generator: The scraper's generator function.
    :param ScraperInfo info: Information on the scraper.
    """
    # A plain registering decorator: record the function and hand it back
    # unchanged. (The previous inner `_decorate` closure that shadowed
    # `function` and was invoked immediately was an equivalent no-op wrapper.)
    transformers.append(function)
    return function
+
+
def load():  # pragma: nocover
    """Discover and register plugins advertised through package entry points."""
    for ep in entry_points(group="hircine.scraper"):
        register_scraper(ep.name, ep.load())

    # Transformers self-register on import via the @transformer decorator,
    # so simply loading the entry point is enough.
    for ep in entry_points(group="hircine.transformer"):
        ep.load()


load()
diff --git a/src/hircine/plugins/scrapers/__init__.py b/src/hircine/plugins/scrapers/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/hircine/plugins/scrapers/__init__.py
diff --git a/src/hircine/plugins/scrapers/anchira.py b/src/hircine/plugins/scrapers/anchira.py
new file mode 100644
index 0000000..aa224b9
--- /dev/null
+++ b/src/hircine/plugins/scrapers/anchira.py
@@ -0,0 +1,101 @@
+import re
+
+import yaml
+
+import hircine.enums as enums
+from hircine.scraper import Scraper
+from hircine.scraper.types import (
+ URL,
+ Artist,
+ Censorship,
+ Circle,
+ Date,
+ Direction,
+ Language,
+ Rating,
+ Tag,
+ Title,
+ World,
+)
+from hircine.scraper.utils import open_archive_file
+
+URL_REGEX = re.compile(r"^https?://anchira\.to/g/")
+
+
+class AnchiraYamlScraper(Scraper):
+ """
+ A scraper for ``info.yaml`` files found in archives downloaded from
+ *anchira.to*.
+
+ .. list-table::
+ :align: left
+
+ * - **Requires**
+ - ``info.yaml`` in the archive or as a sidecar.
+ * - **Source**
+ - ``anchira.to``
+ """
+
+ name = "anchira.to info.yaml"
+ source = "anchira.to"
+
+ def __init__(self, comic):
+ super().__init__(comic)
+
+ self.data = self.load()
+ source = self.data.get("Source")
+
+ if source and re.match(URL_REGEX, source):
+ self.is_available = True
+
+ def load(self):
+ try:
+ with open_archive_file(self.comic.archive, "info.yaml") as yif:
+ return yaml.safe_load(yif)
+ except Exception:
+ return {}
+
+ def scrape(self):
+ parsers = {
+ "Title": Title,
+ "Artist": Artist,
+ "URL": URL,
+ "Released": Date.from_timestamp,
+ "Circle": Circle,
+ "Parody": self.parse_world,
+ "Tags": self.parse_tag,
+ }
+
+ for field, parser in parsers.items():
+ if field not in self.data:
+ continue
+
+ value = self.data[field]
+
+ if isinstance(value, list):
+ yield from [lambda i=x: parser(i) for x in value]
+ else:
+ yield lambda: parser(value)
+
+ yield Language(enums.Language.EN)
+ yield Direction(enums.Direction.RIGHT_TO_LEFT)
+
+ def parse_world(self, input):
+ match input:
+ case "Original Work":
+ return
+
+ return World(input)
+
+ def parse_tag(self, input):
+ match input:
+ case "Unlimited":
+ return
+ case "Hentai":
+ return Rating(value=enums.Rating.EXPLICIT)
+ case "Non-H" | "Ecchi":
+ return Rating(value=enums.Rating.QUESTIONABLE)
+ case "Uncensored":
+ return Censorship(value=enums.Censorship.NONE)
+ case _:
+ return Tag.from_string(input)
diff --git a/src/hircine/plugins/scrapers/ehentai_api.py b/src/hircine/plugins/scrapers/ehentai_api.py
new file mode 100644
index 0000000..70fcf57
--- /dev/null
+++ b/src/hircine/plugins/scrapers/ehentai_api.py
@@ -0,0 +1,75 @@
+import html
+import json
+import re
+
+import requests
+
+from hircine.scraper import ScrapeError, Scraper
+
+from .handlers.exhentai import ExHentaiHandler
+
+API_URL = "https://api.e-hentai.org/api.php"
+URL_REGEX = re.compile(
+ r"^https?://(?:exhentai|e-hentai).org/g/(?P<id>\d+)/(?P<token>[0-9a-fA-F]+).*"
+)
+
+
class EHentaiAPIScraper(Scraper):
    """
    A scraper for the `E-Hentai API <https://ehwiki.org/wiki/API>`_.

    .. list-table::
        :align: left

        * - **Requires**
          - The comic :attr:`URL <hircine.api.types.FullComic.url>` pointing to
            a gallery on *e-hentai.org* or *exhentai.org*
        * - **Source**
          - ``exhentai``

    """

    name = "e-hentai.org API"
    source = "exhentai"

    def __init__(self, comic):
        super().__init__(comic)

        if self.comic.url:
            match = re.fullmatch(URL_REGEX, self.comic.url)

            if match:
                self.is_available = True
                self.id = int(match.group("id"))
                self.token = match.group("token")

    def scrape(self):
        """Query the ``gdata`` API endpoint and delegate parsing to the
        ExHentai handler.

        :raises ScrapeError: if the request fails or the response is malformed.
        """
        payload = json.dumps(
            {
                "method": "gdata",
                "gidlist": [[self.id, self.token]],
                "namespace": 1,
            },
            separators=(",", ":"),
        )

        response = requests.post(API_URL, data=payload)

        if response.status_code != requests.codes.ok:
            # (Also fixes a stray trailing quote in the original message.)
            raise ScrapeError(
                f"Request failed with status code {response.status_code}"
            )

        # Keep the try narrow: only the decode/extraction can legitimately
        # fail here. A malformed payload (missing "gmetadata") previously
        # escaped as a raw KeyError/IndexError.
        try:
            data = json.loads(response.text)["gmetadata"][0]
        except (json.JSONDecodeError, KeyError, IndexError):
            raise ScrapeError("Could not parse JSON response")

        # The API returns HTML-escaped titles; unescape before handing off.
        for key in ("title", "title_jpn"):
            if data.get(key):
                data[key] = html.unescape(data[key])

        yield from ExHentaiHandler().scrape(data)
diff --git a/src/hircine/plugins/scrapers/gallery_dl.py b/src/hircine/plugins/scrapers/gallery_dl.py
new file mode 100644
index 0000000..a6cebc4
--- /dev/null
+++ b/src/hircine/plugins/scrapers/gallery_dl.py
@@ -0,0 +1,54 @@
+import json
+
+from hircine.scraper import Scraper
+from hircine.scraper.utils import open_archive_file
+
+from .handlers.dynastyscans import DynastyScansHandler
+from .handlers.e621 import E621Handler
+from .handlers.exhentai import ExHentaiHandler
+from .handlers.mangadex import MangadexHandler
+
+HANDLERS = {
+ "dynastyscans": DynastyScansHandler,
+ "e621": E621Handler,
+ "exhentai": ExHentaiHandler,
+ "mangadex": MangadexHandler,
+}
+
+
class GalleryDLScraper(Scraper):
    """
    A scraper for `gallery-dl's <https://github.com/mikf/gallery-dl>`_
    ``info.json`` files. For now supports only a select subset of extractors.

    .. list-table::
        :align: left

        * - **Requires**
          - ``info.json`` in the archive or as a sidecar.
        * - **Source**
          - ``dynastyscans``, ``e621``, ``exhentai``, ``mangadex``
    """

    def __init__(self, comic):
        super().__init__(comic)

        self.data = self.load()

        # A single lookup decides availability and picks the handler.
        handler_cls = HANDLERS.get(self.data.get("category"))
        if handler_cls is not None:
            self.is_available = True

            self.handler = handler_cls()
            self.source = self.handler.source
            self.name = f"gallery-dl info.json ({self.source})"

    def load(self):
        """Parse ``info.json`` from the archive/sidecar; {} on any failure."""
        try:
            with open_archive_file(self.comic.archive, "info.json") as jif:
                return json.load(jif)
        except Exception:
            return {}

    def scrape(self):
        """Delegate scraping to the extractor-specific handler."""
        yield from self.handler.scrape(self.data)
diff --git a/src/hircine/plugins/scrapers/handlers/__init__.py b/src/hircine/plugins/scrapers/handlers/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/__init__.py
diff --git a/src/hircine/plugins/scrapers/handlers/dynastyscans.py b/src/hircine/plugins/scrapers/handlers/dynastyscans.py
new file mode 100644
index 0000000..ded015b
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/dynastyscans.py
@@ -0,0 +1,41 @@
+import hircine.enums as enums
+from hircine.scraper import ScrapeWarning
+from hircine.scraper.types import (
+ Artist,
+ Circle,
+ Date,
+ Language,
+ Title,
+)
+from hircine.scraper.utils import parse_dict
+
+
class DynastyScansHandler:
    """Handler for gallery-dl ``info.json`` data from dynastyscans."""

    source = "dynastyscans"

    def scrape(self, data):
        """Yield scraped metadata items for a dynastyscans chapter."""
        yield from parse_dict(
            {
                "date": Date.from_iso,
                "lang": self.parse_language,
                "author": Artist,
                "group": Circle,
            },
            data,
        )

        manga = data.get("manga")
        if not manga:
            return

        # Assemble "Manga Ch. N: Subtitle", omitting missing pieces.
        pieces = [manga]
        if chapter := data.get("chapter"):
            pieces.append(f" Ch. {chapter}")
        if subtitle := data.get("title"):
            pieces.append(f": {subtitle}")

        yield Title("".join(pieces))

    def parse_language(self, input):
        """Map a language code to a Language item.

        :raises ScrapeWarning: when the code has no matching enum member.
        """
        try:
            return Language(value=enums.Language[input.upper()])
        except (KeyError, ValueError) as e:
            raise ScrapeWarning(f"Could not parse language: '{input}'") from e
diff --git a/src/hircine/plugins/scrapers/handlers/e621.py b/src/hircine/plugins/scrapers/handlers/e621.py
new file mode 100644
index 0000000..6b798fd
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/e621.py
@@ -0,0 +1,81 @@
+import hircine.enums as enums
+from hircine.scraper import ScrapeWarning
+from hircine.scraper.types import (
+ URL,
+ Artist,
+ Category,
+ Censorship,
+ Character,
+ Date,
+ Language,
+ Rating,
+ Tag,
+ Title,
+ World,
+)
+from hircine.scraper.utils import parse_dict
+
+
def replace_underscore(fun):
    """Wrap *fun* so it receives its input with underscores turned into spaces."""

    def _wrapper(input):
        return fun(input.replace("_", " "))

    return _wrapper
+
+
class E621Handler:
    """Handler for gallery-dl ``info.json`` data from e621."""

    source = "e621"

    # Site rating letters mapped to Rating items (shared, read-only).
    ratings = {
        "e": Rating(enums.Rating.EXPLICIT),
        "q": Rating(enums.Rating.QUESTIONABLE),
        "s": Rating(enums.Rating.SAFE),
    }

    def scrape(self, data):
        """Dispatch on the extractor subcategory; only pools are supported."""
        if data.get("subcategory") == "pool":
            yield from self.scrape_pool(data)

    def scrape_pool(self, data):
        """Yield scraped metadata items for an e621 pool."""
        parsers = {
            "date": Date.from_iso,
            "rating": self.ratings.get,
            "pool": {
                "id": lambda pid: URL(f"https://e621.net/pools/{pid}"),
                "name": Title,
            },
            "tags": {
                "general": replace_underscore(Tag.from_string),
                "artist": replace_underscore(Artist),
                "character": replace_underscore(Character),
                "copyright": replace_underscore(World),
                "species": replace_underscore(Tag.from_string),
                "meta": self.parse_meta,
            },
        }

        # Assume uncensored until a censorship meta tag says otherwise.
        self.is_likely_uncensored = True

        yield from parse_dict(parsers, data)

        if self.is_likely_uncensored:
            yield Censorship(enums.Censorship.NONE)

    def parse_meta(self, input):
        """Translate meta tags into Category/Censorship/Language items."""
        if input == "comic":
            return Category(enums.Category.COMIC)
        if input == "censor_bar":
            self.is_likely_uncensored = False
            return Censorship(enums.Censorship.BAR)
        if input == "mosaic_censorship":
            self.is_likely_uncensored = False
            return Censorship(enums.Censorship.MOSAIC)
        if input == "uncensored":
            return Censorship(enums.Censorship.NONE)

        # "<language>_text" meta tags carry the work's language.
        if input.endswith("_text"):
            lang, _ = input.split("_text", 1)

            try:
                return Language(value=enums.Language(lang.capitalize()))
            except ValueError as e:
                raise ScrapeWarning(f"Could not parse language: '{input}'") from e
diff --git a/src/hircine/plugins/scrapers/handlers/exhentai.py b/src/hircine/plugins/scrapers/handlers/exhentai.py
new file mode 100644
index 0000000..12c22d7
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/exhentai.py
@@ -0,0 +1,139 @@
+import re
+
+import hircine.enums as enums
+from hircine.scraper import ScrapeWarning
+from hircine.scraper.types import (
+ URL,
+ Artist,
+ Category,
+ Censorship,
+ Character,
+ Circle,
+ Date,
+ Direction,
+ Language,
+ OriginalTitle,
+ Rating,
+ Tag,
+ Title,
+ World,
+)
+from hircine.scraper.utils import parse_dict
+
+
def sanitize(title, split=False):
    """Strip decorations from an exhentai gallery title.

    Removes ``[...]`` and ``{...}`` groups anywhere, ``=...=`` spans, and a
    leading ``(...)`` group, then collapses runs of whitespace. With
    ``split=True``, everything up to the first ``|`` is discarded as well.
    """
    text = re.sub(r"\[[^\]]+\]|{[^}]+}|=[^=]+=|^\([^)]+\)", "", title)

    if split and "|" in text:
        _, text = text.split("|", 1)

    return re.sub(r"\s{2,}", " ", text).strip()
+
+
+class ExHentaiHandler:
+ source = "exhentai"
+
+ def scrape(self, data):
+ category_field = "eh_category" if "eh_category" in data else "category"
+
+ parsers = {
+ category_field: self.parse_category,
+ "posted": Date.from_timestamp,
+ "date": Date.from_iso,
+ "lang": self.parse_language,
+ "tags": self.parse_tag,
+ "title": lambda t: Title(sanitize(t, split=True)),
+ "title_jpn": lambda t: OriginalTitle(sanitize(t)),
+ }
+
+ self.is_likely_pornographic = True
+ self.is_likely_rtl = False
+ self.has_censorship_tag = False
+ self.is_western = False
+
+ yield from parse_dict(parsers, data)
+
+ if self.is_likely_pornographic:
+ yield Rating(enums.Rating.EXPLICIT)
+
+ if not self.has_censorship_tag:
+ if self.is_western:
+ yield Censorship(enums.Censorship.NONE)
+ else:
+ yield Censorship(enums.Censorship.BAR)
+
+ if self.is_likely_rtl:
+ yield Direction(enums.Direction.RIGHT_TO_LEFT)
+
+ if (gid := data["gid"]) and (token := data["token"]):
+ yield URL(f"https://exhentai.org/g/{gid}/{token}")
+
+ def parse_category(self, input):
+ match input.lower():
+ case "doujinshi":
+ self.is_likely_rtl = True
+ return Category(value=enums.Category.DOUJINSHI)
+ case "manga":
+ self.is_likely_rtl = True
+ return Category(value=enums.Category.MANGA)
+ case "western":
+ self.is_western = True
+ case "artist cg":
+ return Category(value=enums.Category.COMIC)
+ case "game cg":
+ return Category(value=enums.Category.GAME_CG)
+ case "image set":
+ return Category(value=enums.Category.IMAGE_SET)
+ case "non-h":
+ self.is_likely_pornographic = False
+ return Rating(value=enums.Rating.QUESTIONABLE)
+
+ def parse_tag(self, input):
+ match input.split(":"):
+ case ["parody", value]:
+ return World(value)
+ case ["group", value]:
+ return Circle(value)
+ case ["artist", value]:
+ return Artist(value)
+ case ["character", value]:
+ return Character(value)
+ case ["language", value]:
+ return self.parse_language(value, from_value=True)
+ case ["other", "artbook"]:
+ return Category(enums.Category.ARTBOOK)
+ case ["other", "full censorship"]:
+ self.has_censorship_tag = True
+ return Censorship(enums.Censorship.FULL)
+ case ["other", "mosaic censorship"]:
+ self.has_censorship_tag = True
+ return Censorship(enums.Censorship.MOSAIC)
+ case ["other", "uncensored"]:
+ self.has_censorship_tag = True
+ return Censorship(enums.Censorship.NONE)
+ case ["other", "non-h imageset" | "western imageset"]:
+ return Category(value=enums.Category.IMAGE_SET)
+ case ["other", "western non-h"]:
+ self.is_likely_pornographic = False
+ return Rating(value=enums.Rating.QUESTIONABLE)
+ case ["other", "comic"]:
+ return Category(value=enums.Category.COMIC)
+ case ["other", "variant set"]:
+ return Category(value=enums.Category.VARIANT_SET)
+ case ["other", "webtoon"]:
+ return Category(value=enums.Category.WEBTOON)
+ case [namespace, tag]:
+ return Tag(namespace=namespace, tag=tag)
+ case [tag]:
+ return Tag(namespace=None, tag=tag)
+
+ def parse_language(self, input, from_value=False):
+ if not input or input in ["translated", "speechless", "N/A"]:
+ return
+
+ try:
+ if from_value:
+ return Language(value=enums.Language(input.capitalize()))
+ else:
+ return Language(value=enums.Language[input.upper()])
+ except (KeyError, ValueError) as e:
+ raise ScrapeWarning(f"Could not parse language: '{input}'") from e
diff --git a/src/hircine/plugins/scrapers/handlers/mangadex.py b/src/hircine/plugins/scrapers/handlers/mangadex.py
new file mode 100644
index 0000000..7bc371d
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/mangadex.py
@@ -0,0 +1,54 @@
+import hircine.enums as enums
+from hircine.scraper import ScrapeWarning
+from hircine.scraper.types import (
+ URL,
+ Artist,
+ Circle,
+ Date,
+ Language,
+ Tag,
+ Title,
+)
+from hircine.scraper.utils import parse_dict
+
+
class MangadexHandler:
    """Handler for gallery-dl ``info.json`` data from mangadex."""

    source = "mangadex"

    def scrape(self, data):
        """Yield scraped metadata items for a mangadex chapter."""
        parsers = {
            "date": Date.from_iso,
            "lang": self.parse_language,
            "tags": Tag.from_string,
            "artist": Artist,
            "author": Artist,
            "group": Circle,
        }

        yield from parse_dict(parsers, data)

        if chapter_id := data.get("chapter_id"):
            yield URL(f"https://mangadex.org/chapter/{chapter_id}")

        if manga := data.get("manga"):
            title = manga

            if volume := data.get("volume"):
                title = title + f" Vol. {volume}"

            if chapter := data.get("chapter"):
                if volume:
                    title = title + f", Ch. {chapter}"
                else:
                    # Separator space was missing here, yielding titles like
                    # "MangaCh. 1" instead of "Manga Ch. 1".
                    title = title + f" Ch. {chapter}"

            if subtitle := data.get("title"):
                title = title + f": {subtitle}"

            yield Title(title)

    def parse_language(self, input):
        """Map a language code to a Language item.

        :raises ScrapeWarning: when the code has no matching enum member.
        """
        try:
            return Language(value=enums.Language[input.upper()])
        except (KeyError, ValueError) as e:
            raise ScrapeWarning(f"Could not parse language: '{input}'") from e