| |
| import re |
|
|
| from ultradata_math_parser.utils import * |
| from ultradata_math_parser.parsers.base_parser import BaseParser |
| from ultradata_math_parser.parsers.title_parser import TitleParser |
|
|
|
|
class CustomParser(BaseParser):
    """Parser driven by caller-supplied XPath rules.

    The ``rule`` mapping passed to :meth:`extract` may contain:

    * ``"clean"``   -- iterable of XPath expressions; every matching node is
      removed from the tree before extraction.
    * ``"title"``   -- mapping with a ``"value"`` XPath used to extract the
      title. When absent, ``TitleParser`` is used instead.
    * ``"content"`` -- mapping with a ``"value"`` XPath selecting the body
      node. Required; extraction fails with ``ValueError`` without it.
    """

    def __init__(self) -> None:
        super().__init__()

    def use_clean_rule(self, tree, clean_rules):
        """Remove every node matched by any XPath in ``clean_rules``.

        Nodes are removed through ``self.remove_node``; the same ``tree``
        object is returned for convenience.
        """
        for clean_rule in clean_rules:
            for node in tree.xpath(clean_rule):
                self.remove_node(node)
        return tree

    def use_extract_rule(self, tree, extract_rule):
        """Evaluate ``extract_rule["value"]`` against ``tree``.

        If the XPath contains ``/text()`` the results are concatenated and
        stripped, yielding a string (empty when nothing matches). Otherwise
        the first match is returned as-is.

        Raises:
            IndexError: for a non-``/text()`` XPath that matches nothing.
        """
        xpath_expr = extract_rule["value"]
        if "/text()" in xpath_expr:
            return "".join(tree.xpath(xpath_expr)).strip()
        return tree.xpath(xpath_expr)[0]

    def extract(self, html="", base_url="", rule=None, **kwargs) -> dict:
        """Extract title and body HTML from ``html`` using custom rules.

        Args:
            html: Raw HTML source.
            base_url: Fallback base URL; overridden by the document's
                ``<base href>`` when that value contains ``"http"``.
            rule: Mapping of extraction rules (see the class docstring).
                ``None`` is treated as an empty mapping.
            **kwargs: ``include_images`` (default ``False``) is read here and
                stored on ``self.include_images``.

        Returns:
            A dict with keys ``xp_num`` (always ``"custom"``), ``drop_list``
            (always ``False``), ``html``, ``title``, ``base_url`` and
            ``text_length``.

        Raises:
            ValueError: if the HTML cannot be loaded into a tree, or if the
                content rule is missing or fails to evaluate.
        """
        # Avoid a shared mutable default; None stands in for "no rules".
        if rule is None:
            rule = {}

        self.include_images = kwargs.get("include_images", False)

        tree = load_html(html)
        if tree is None:
            raise ValueError("could not load an HTML tree from the input")

        # Prefer the document's own <base href> when it looks like a URL.
        base_href = tree.xpath("//base/@href")
        if base_href and "http" in base_href[0]:
            base_url = base_href[0]

        if "clean" in rule:
            tree = self.use_clean_rule(tree, rule["clean"])

        if "title" not in rule:
            title = TitleParser().process(tree)
        else:
            title = self.use_extract_rule(tree, rule["title"])

        # Any failure here (missing "content" key, no match, bad XPath) is
        # reported as ValueError, with the original error kept as the cause.
        try:
            body_tree = self.use_extract_rule(tree, rule["content"])
        except Exception as exc:
            raise ValueError("content rule is missing or matched nothing") from exc

        if not self.include_images:
            self._remove_images_from_tree(body_tree)
        body_html = tostring(body_tree, encoding=str)
        # NOTE(review): called regardless of include_images; presumably the
        # helper checks the flag itself -- confirm against BaseParser.
        body_html = self._strip_images_from_html(body_html)

        text_length = self._text_length_from_html(body_html)

        return {
            "xp_num": "custom",
            "drop_list": False,
            "html": body_html,
            "title": title,
            "base_url": base_url,
            "text_length": text_length,
        }
|
|