summaryrefslogtreecommitdiffstatshomepage
path: root/src/hircine/plugins/scrapers/handlers
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/hircine/plugins/scrapers/handlers/__init__.py0
-rw-r--r--src/hircine/plugins/scrapers/handlers/dynastyscans.py41
-rw-r--r--src/hircine/plugins/scrapers/handlers/e621.py81
-rw-r--r--src/hircine/plugins/scrapers/handlers/exhentai.py139
-rw-r--r--src/hircine/plugins/scrapers/handlers/mangadex.py54
5 files changed, 315 insertions, 0 deletions
diff --git a/src/hircine/plugins/scrapers/handlers/__init__.py b/src/hircine/plugins/scrapers/handlers/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/__init__.py
diff --git a/src/hircine/plugins/scrapers/handlers/dynastyscans.py b/src/hircine/plugins/scrapers/handlers/dynastyscans.py
new file mode 100644
index 0000000..ded015b
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/dynastyscans.py
@@ -0,0 +1,41 @@
+import hircine.enums as enums
+from hircine.scraper import ScrapeWarning
+from hircine.scraper.types import (
+ Artist,
+ Circle,
+ Date,
+ Language,
+ Title,
+)
+from hircine.scraper.utils import parse_dict
+
+
class DynastyScansHandler:
    """Handler that turns Dynasty Scans metadata into scraper items.

    ``scrape`` is a generator: it first yields whatever ``parse_dict``
    produces for the simple one-to-one fields and then, if a manga name is
    present, a single combined ``Title``.
    """

    source = "dynastyscans"

    def scrape(self, data):
        """Yield scraped items for the metadata mapping ``data``."""
        field_parsers = {
            "date": Date.from_iso,
            "lang": self.parse_language,
            "author": Artist,
            "group": Circle,
        }

        yield from parse_dict(field_parsers, data)

        manga = data.get("manga")
        if not manga:
            # Without a manga name there is nothing to build a title from.
            return

        full_title = manga

        chapter = data.get("chapter")
        if chapter:
            full_title = full_title + f" Ch. {chapter}"

        subtitle = data.get("title")
        if subtitle:
            full_title = full_title + f": {subtitle}"

        yield Title(full_title)

    def parse_language(self, input):
        """Look up ``input`` (upper-cased) as a member name of ``enums.Language``.

        Raises:
            ScrapeWarning: if the lookup fails with KeyError or ValueError.
        """
        member_name = input.upper()

        try:
            return Language(value=enums.Language[member_name])
        except (KeyError, ValueError) as e:
            raise ScrapeWarning(f"Could not parse language: '{input}'") from e
diff --git a/src/hircine/plugins/scrapers/handlers/e621.py b/src/hircine/plugins/scrapers/handlers/e621.py
new file mode 100644
index 0000000..6b798fd
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/e621.py
@@ -0,0 +1,81 @@
+import hircine.enums as enums
+from hircine.scraper import ScrapeWarning
+from hircine.scraper.types import (
+ URL,
+ Artist,
+ Category,
+ Censorship,
+ Character,
+ Date,
+ Language,
+ Rating,
+ Tag,
+ Title,
+ World,
+)
+from hircine.scraper.utils import parse_dict
+
+
def replace_underscore(fun):
    """Wrap ``fun`` so underscores in its string argument become spaces.

    The returned callable takes one string, replaces every ``"_"`` with
    ``" "`` and passes the result on to ``fun``, returning whatever ``fun``
    returns.
    """

    def wrapper(input):
        spaced = input.replace("_", " ")
        return fun(spaced)

    return wrapper
+
+
class E621Handler:
    """Handler that turns e621 metadata into scraper items.

    Only metadata whose ``subcategory`` equals ``"pool"`` produces output;
    anything else yields nothing.
    """

    source = "e621"

    # Single-letter rating codes mapped to ready-made Rating items.
    ratings = {
        "e": Rating(enums.Rating.EXPLICIT),
        "q": Rating(enums.Rating.QUESTIONABLE),
        "s": Rating(enums.Rating.SAFE),
    }

    def scrape(self, data):
        """Yield scraped items for ``data`` if it describes a pool."""
        if data.get("subcategory") == "pool":
            yield from self.scrape_pool(data)

    def scrape_pool(self, data):
        """Yield scraped items for pool metadata.

        After the per-field parsers have run, ``Censorship.NONE`` is yielded
        unless ``parse_meta`` saw a tag that indicates bar or mosaic
        censorship.
        """
        # Reset per-scrape state before parse_meta gets a chance to clear it.
        self.is_likely_uncensored = True

        def pool_url(pid):
            return URL(f"https://e621.net/pools/{pid}")

        pool_parsers = {
            "id": pool_url,
            "name": Title,
        }

        tag_parsers = {
            "general": replace_underscore(Tag.from_string),
            "artist": replace_underscore(Artist),
            "character": replace_underscore(Character),
            "copyright": replace_underscore(World),
            "species": replace_underscore(Tag.from_string),
            "meta": self.parse_meta,
        }

        parsers = {
            "date": Date.from_iso,
            "rating": self.ratings.get,
            "pool": pool_parsers,
            "tags": tag_parsers,
        }

        yield from parse_dict(parsers, data)

        if self.is_likely_uncensored:
            yield Censorship(enums.Censorship.NONE)

    def parse_meta(self, input):
        """Translate a single "meta" tag into a scraped item.

        Recognised tags map to a Category or Censorship item; tags ending in
        ``"_text"`` are treated as ``<language>_text``. Any other tag returns
        ``None``.

        Raises:
            ScrapeWarning: if a ``*_text`` tag does not name a known language.
        """
        if input == "comic":
            return Category(enums.Category.COMIC)

        if input == "censor_bar":
            self.is_likely_uncensored = False
            return Censorship(enums.Censorship.BAR)

        if input == "mosaic_censorship":
            self.is_likely_uncensored = False
            return Censorship(enums.Censorship.MOSAIC)

        if input == "uncensored":
            # The is_likely_uncensored flag is deliberately left untouched here.
            return Censorship(enums.Censorship.NONE)

        if not input.endswith("_text"):
            return None

        # Everything before the first "_text" is the language name.
        lang = input.partition("_text")[0]

        try:
            return Language(value=enums.Language(lang.capitalize()))
        except ValueError as e:
            raise ScrapeWarning(f"Could not parse language: '{input}'") from e
diff --git a/src/hircine/plugins/scrapers/handlers/exhentai.py b/src/hircine/plugins/scrapers/handlers/exhentai.py
new file mode 100644
index 0000000..12c22d7
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/exhentai.py
@@ -0,0 +1,139 @@
+import re
+
+import hircine.enums as enums
+from hircine.scraper import ScrapeWarning
+from hircine.scraper.types import (
+ URL,
+ Artist,
+ Category,
+ Censorship,
+ Character,
+ Circle,
+ Date,
+ Direction,
+ Language,
+ OriginalTitle,
+ Rating,
+ Tag,
+ Title,
+ World,
+)
+from hircine.scraper.utils import parse_dict
+
+
def sanitize(title, split=False):
    """Strip decoration from a gallery title.

    Removes ``[...]``, ``{...}`` and ``=...=`` groups anywhere in the title
    and a ``(...)`` group at the very start. When ``split`` is true and the
    remaining text contains ``"|"``, only the part after the first ``"|"``
    is kept. Runs of two or more whitespace characters are collapsed to one
    space and the result is stripped.
    """
    decorations = "|".join(
        (
            r"\[[^\]]+\]",  # [bracketed]
            r"{[^}]+}",  # {braced}
            r"=[^=]+=",  # =delimited=
            r"^\([^)]+\)",  # (parenthesised), only at the start
        )
    )
    cleaned = re.sub(decorations, "", title)

    if split and "|" in cleaned:
        cleaned = cleaned.partition("|")[2]

    collapsed = re.sub(r"\s{2,}", " ", cleaned)
    return collapsed.strip()
+
+
+class ExHentaiHandler:
+ source = "exhentai"
+
+ def scrape(self, data):
+ category_field = "eh_category" if "eh_category" in data else "category"
+
+ parsers = {
+ category_field: self.parse_category,
+ "posted": Date.from_timestamp,
+ "date": Date.from_iso,
+ "lang": self.parse_language,
+ "tags": self.parse_tag,
+ "title": lambda t: Title(sanitize(t, split=True)),
+ "title_jpn": lambda t: OriginalTitle(sanitize(t)),
+ }
+
+ self.is_likely_pornographic = True
+ self.is_likely_rtl = False
+ self.has_censorship_tag = False
+ self.is_western = False
+
+ yield from parse_dict(parsers, data)
+
+ if self.is_likely_pornographic:
+ yield Rating(enums.Rating.EXPLICIT)
+
+ if not self.has_censorship_tag:
+ if self.is_western:
+ yield Censorship(enums.Censorship.NONE)
+ else:
+ yield Censorship(enums.Censorship.BAR)
+
+ if self.is_likely_rtl:
+ yield Direction(enums.Direction.RIGHT_TO_LEFT)
+
+ if (gid := data["gid"]) and (token := data["token"]):
+ yield URL(f"https://exhentai.org/g/{gid}/{token}")
+
+ def parse_category(self, input):
+ match input.lower():
+ case "doujinshi":
+ self.is_likely_rtl = True
+ return Category(value=enums.Category.DOUJINSHI)
+ case "manga":
+ self.is_likely_rtl = True
+ return Category(value=enums.Category.MANGA)
+ case "western":
+ self.is_western = True
+ case "artist cg":
+ return Category(value=enums.Category.COMIC)
+ case "game cg":
+ return Category(value=enums.Category.GAME_CG)
+ case "image set":
+ return Category(value=enums.Category.IMAGE_SET)
+ case "non-h":
+ self.is_likely_pornographic = False
+ return Rating(value=enums.Rating.QUESTIONABLE)
+
+ def parse_tag(self, input):
+ match input.split(":"):
+ case ["parody", value]:
+ return World(value)
+ case ["group", value]:
+ return Circle(value)
+ case ["artist", value]:
+ return Artist(value)
+ case ["character", value]:
+ return Character(value)
+ case ["language", value]:
+ return self.parse_language(value, from_value=True)
+ case ["other", "artbook"]:
+ return Category(enums.Category.ARTBOOK)
+ case ["other", "full censorship"]:
+ self.has_censorship_tag = True
+ return Censorship(enums.Censorship.FULL)
+ case ["other", "mosaic censorship"]:
+ self.has_censorship_tag = True
+ return Censorship(enums.Censorship.MOSAIC)
+ case ["other", "uncensored"]:
+ self.has_censorship_tag = True
+ return Censorship(enums.Censorship.NONE)
+ case ["other", "non-h imageset" | "western imageset"]:
+ return Category(value=enums.Category.IMAGE_SET)
+ case ["other", "western non-h"]:
+ self.is_likely_pornographic = False
+ return Rating(value=enums.Rating.QUESTIONABLE)
+ case ["other", "comic"]:
+ return Category(value=enums.Category.COMIC)
+ case ["other", "variant set"]:
+ return Category(value=enums.Category.VARIANT_SET)
+ case ["other", "webtoon"]:
+ return Category(value=enums.Category.WEBTOON)
+ case [namespace, tag]:
+ return Tag(namespace=namespace, tag=tag)
+ case [tag]:
+ return Tag(namespace=None, tag=tag)
+
+ def parse_language(self, input, from_value=False):
+ if not input or input in ["translated", "speechless", "N/A"]:
+ return
+
+ try:
+ if from_value:
+ return Language(value=enums.Language(input.capitalize()))
+ else:
+ return Language(value=enums.Language[input.upper()])
+ except (KeyError, ValueError) as e:
+ raise ScrapeWarning(f"Could not parse language: '{input}'") from e
diff --git a/src/hircine/plugins/scrapers/handlers/mangadex.py b/src/hircine/plugins/scrapers/handlers/mangadex.py
new file mode 100644
index 0000000..7bc371d
--- /dev/null
+++ b/src/hircine/plugins/scrapers/handlers/mangadex.py
@@ -0,0 +1,54 @@
+import hircine.enums as enums
+from hircine.scraper import ScrapeWarning
+from hircine.scraper.types import (
+ URL,
+ Artist,
+ Circle,
+ Date,
+ Language,
+ Tag,
+ Title,
+)
+from hircine.scraper.utils import parse_dict
+
+
class MangadexHandler:
    """Handler that turns MangaDex metadata into scraper items.

    ``scrape`` is a generator: it yields whatever ``parse_dict`` produces
    for the simple one-to-one fields, then a chapter URL and a combined
    ``Title`` when the required fields are present.
    """

    source = "mangadex"

    def scrape(self, data):
        """Yield scraped items for the metadata mapping ``data``."""
        parsers = {
            "date": Date.from_iso,
            "lang": self.parse_language,
            "tags": Tag.from_string,
            "artist": Artist,
            "author": Artist,
            "group": Circle,
        }

        yield from parse_dict(parsers, data)

        if chapter_id := data.get("chapter_id"):
            yield URL(f"https://mangadex.org/chapter/{chapter_id}")

        if title := self._build_title(data):
            yield Title(title)

    @staticmethod
    def _build_title(data):
        """Build the display title string, or return None without a manga name.

        The result has the shape ``<manga>[ Vol. V][,][ Ch. C][: <title>]``;
        the comma is only inserted when both volume and chapter are present.
        """
        manga = data.get("manga")
        if not manga:
            return None

        title = manga

        volume = data.get("volume")
        if volume:
            title = title + f" Vol. {volume}"

        if chapter := data.get("chapter"):
            # Always separate the chapter from what precedes it; previously
            # the no-volume case glued "Ch." directly onto the manga name.
            separator = "," if volume else ""
            title = title + f"{separator} Ch. {chapter}"

        if subtitle := data.get("title"):
            title = title + f": {subtitle}"

        return title

    def parse_language(self, input):
        """Look up ``input`` (upper-cased) as a member name of ``enums.Language``.

        Raises:
            ScrapeWarning: if the lookup fails with KeyError or ValueError.
        """
        try:
            return Language(value=enums.Language[input.upper()])
        except (KeyError, ValueError) as e:
            raise ScrapeWarning(f"Could not parse language: '{input}'") from e