from __future__ import annotations

import re
from html import unescape
from html.parser import HTMLParser
from typing import Any


class TextOnlyParser(HTMLParser):
    """Collect the text content of an HTML document, dropping the markup.

    Text inside any element whose tag name is in ``skiplist`` (``script`` and
    ``style`` by default) is ignored. Callers may add more tag names to
    ``skiplist`` before feeding data.
    """

    def __init__(self) -> None:
        super().__init__()
        # False while we are inside an element listed in ``skiplist``.
        self.active = True
        # Text fragments in document order.
        self.buf: list[str] = []
        # Lower-case tag names whose contents must not reach ``buf``.
        self.skiplist: set[str] = {"script", "style"}

    def handle_starttag(self, tag: str, attrs: Any) -> None:
        """Stop collecting text when a skipped element opens."""
        if tag in self.skiplist:
            self.active = False

    def handle_endtag(self, tag: str) -> None:
        """Resume collecting text when a skipped element closes."""
        if tag in self.skiplist:
            self.active = True

    def handle_data(self, data: str) -> None:
        """Record a text fragment unless we are inside a skipped element."""
        if self.active and data:
            self.buf.append(data)

    def get_text(self) -> str:
        """Return the collected text with whitespace runs collapsed.

        All fragments are concatenated, then every run of whitespace is
        replaced by a single space and leading/trailing whitespace is removed.
        """
        messy = "".join(self.buf)
        return " ".join(messy.split())


def html2text(html: str, skip_pre: bool = False) -> str:
    """Return the visible text of *html* as a single whitespace-normalized line.

    Args:
        html: The HTML markup to convert.
        skip_pre: When true, the contents of ``<pre>`` elements are dropped in
            addition to ``<script>`` and ``<style>``.

    Returns:
        The text content with character references decoded and whitespace
        runs collapsed to single spaces.
    """
    parser = TextOnlyParser()
    if skip_pre:
        parser.skiplist.add("pre")

    parser.feed(html)
    # feed() holds back trailing text that might contain a character reference
    # cut in half (e.g. the "&T" in "AT&T"); close() signals end-of-input so
    # that text is flushed to handle_data() instead of being silently lost.
    parser.close()
    return parser.get_text()


def extract_signal_styles(markup: str) -> tuple[str, list[str]]:
    """Convert HTML syntax to Signal text styles.

    This implementation has limited functionality, and only supports the
    features we do use:

    * only supports <b> and <code> tags
    * does not support nested (text) tags

    Args:
        markup: Text containing optional <b>...</b> and <code>...</code>
            spans and HTML character references.

    Returns:
        A ``(text, styles)`` tuple. ``text`` is the markup with the tags
        removed and character references decoded. ``styles`` holds one
        ``"start:length:STYLE"`` entry per closed span, where STYLE is
        ``BOLD`` or ``MONOSPACE`` and start/length are measured with ``len()``
        on the decoded text.

    Raises:
        AssertionError: If a closing tag does not match the most recently
            opened tag.

    Example:

    >>> extract_signal_styles("<b>foo</b> bar")
    ('foo bar', ['0:3:BOLD'])
    """
    # NOTE(review): offsets are Python code-point counts; confirm whether the
    # consumer expects a different unit for non-BMP characters.
    text = ""
    styles: list[str] = []
    tag, tag_idx = "", 0
    # The capturing group makes re.split() return the tags themselves as
    # separate parts, interleaved with the text between them.
    for part in re.split(r"(</?(?:b|code)>)", markup):
        if part == "<b>":
            tag = "BOLD"
            tag_idx = len(text)
        elif part == "</b>":
            assert tag == "BOLD"
            len_tagged = len(text) - tag_idx
            styles.append(f"{tag_idx}:{len_tagged}:{tag}")
        elif part == "<code>":
            tag = "MONOSPACE"
            tag_idx = len(text)
        elif part == "</code>":
            assert tag == "MONOSPACE"
            len_tagged = len(text) - tag_idx
            styles.append(f"{tag_idx}:{len_tagged}:{tag}")
        else:
            # Plain text between tags: decode entities so that offsets refer
            # to the text as it will be displayed.
            text += unescape(part)

    return text, styles