"""Custom HTML-to-Markdown conversion built on top of ``markdownify``."""

import copy
import re
from textwrap import fill

import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
from markdownify import ATX_CLOSED, UNDERLINED, MarkdownConverter, chomp

from . import picture_process

# Upper bound applied to ``colspan`` values when expanding merged cells, so a
# pathological attribute cannot blow up the expanded grid.
_MAX_COLSPAN = 1000


def judge_br_in_table(el):
    """Return True when *el* is, or is nested inside, a ``td``/``tr`` tag."""
    node = el
    while node is not None:
        if node.name in ('td', 'tr'):
            return True
        node = node.parent
    return False


def get_markdown_title_level(el):
    """Return the Markdown heading prefix for a title ``div``.

    Returns ``'## '`` for class ``hdwiki_tmml``, ``'### '`` for class
    ``hdwiki_tmmll`` and ``''`` when *el* is not a title div.
    """
    if el.name != 'div' or 'class' not in el.attrs:
        return ''
    classes = el.attrs['class']
    if 'hdwiki_tmml' in classes:
        return '## '
    if 'hdwiki_tmmll' in classes:
        return '### '
    return ''


def str_is_title(text) -> bool:
    """Return True when *text* (ignoring surrounding whitespace) starts with ``#``."""
    return re.search(r'^#+', text.strip()) is not None


def is_img_div_tag(el) -> bool:
    """Return True when *el* is a ``div`` whose class contains ``img`` or ``img_l``."""
    if el is None or el.name != "div":
        return False
    class_attr = el.get('class')
    if class_attr is None:
        return False
    return "img" in class_attr or "img_l" in class_attr


def is_only_text_div(el) -> bool:
    """Return True when *el* is a non-empty, block-displayed ``div`` holding only text."""
    if el is None or el.name != "div" or el.text == "":
        return False
    if el.get("display", "block") != "block":
        return False
    # The div directly wraps a single text node.
    if isinstance(el.string, NavigableString):
        return True
    # Fallback for text wrapped in nested tags: every non-empty descendant
    # must itself resolve to a single text node, and none may be a
    # table/td/img tag.
    for tag in el.find_all(recursive=True):
        if tag.text == "":
            continue
        if tag.name in ["table", "td", "img"]:
            return False
        if not isinstance(tag.string, NavigableString):
            return False
    return True


def a_tag_is_in_img(el) -> bool:
    """Return True when *el* is an ``a`` tag whose direct parent is an image div."""
    parent = el.parent
    if parent is None:
        return False
    if el.name != "a" or parent.name != "div":
        return False
    return is_img_div_tag(parent)


class CustomMarkDownConverter(MarkdownConverter):
    """``MarkdownConverter`` with customised handling of line breaks, titles,
    images and tables.
    """

    def __init__(self, img_download_path, **options):
        """Store *img_download_path* and forward *options* to ``MarkdownConverter``."""
        super().__init__(**options)
        self.img_download_path = img_download_path

    def convert_br(self, el, text, convert_as_inline):
        """Convert ``br``; line breaks inside table cells stay as ``<br>``."""
        if judge_br_in_table(el):
            # NOTE(review): literal restored as "<br>" based on the adjacent
            # comment about keeping the br format in cells -- confirm.
            return "<br>"
        # Fault tolerance (article 4696): bs4 can mis-parse the HTML and nest
        # image tags under the br tag; keep that content instead of losing it.
        if text.strip():
            return text + "\n"
        return super().convert_br(el, text, convert_as_inline)

    @staticmethod
    def convert_img_div(text):
        """Insert a newline before the first ``**...**`` run in *text*.

        Used for image divs so the image and its bold description end up on
        separate lines.
        """
        match = re.search(r'\*\*(.*?)\*\*', text)
        if match:
            start = match.start()
            text = text[:start] + "\n" + text[start:]
        return text

    def convert_div(self, el, text, convert_as_inline):
        """Convert ``div``: title divs become headings, image divs get a line
        break before the description, plain-text divs become paragraphs.
        """
        title_level = get_markdown_title_level(el)
        if title_level != '':
            return "\n\n" + title_level + text + '\n\n'
        if is_img_div_tag(el):
            return self.convert_img_div(text)
        if is_only_text_div(el):
            text = "\n\n" + text + "\n\n"
        return text

    @staticmethod
    def is_valid_url(url, timeout=10):
        """Return True when a HEAD request to *url* ends with status 200.

        *timeout* (seconds) bounds the request so an unresponsive host cannot
        block the conversion; any ``requests`` error yields False.
        """
        try:
            response = requests.head(url, allow_redirects=True, timeout=timeout)
        except requests.RequestException:
            return False
        return response.status_code == 200

    @staticmethod
    def try_complete_img_description(img_el):
        """Fill the ``alt`` attribute of *img_el* from an enclosing container.

        The nearest enclosing image div is preferred; when there is none, or
        its text is empty, the nearest enclosing ``figure`` is used. Nothing
        happens when *img_el* is not an ``img`` tag.
        """
        if img_el is None or img_el.name != "img":
            return
        # Nearest ancestor that is an image div.
        img_div = next((p for p in img_el.parents if is_img_div_tag(p)), None)
        if img_div is not None and len(img_div.text) != 0:
            img_el.attrs["alt"] = img_div.text
            return
        # Nearest ancestor that is a figure tag.
        figure = next((p for p in img_el.parents if p.name == 'figure'), None)
        if figure is not None and len(figure.text) != 0:
            img_el.attrs["alt"] = figure.text

    def convert_figcaption(self, el, text, convert_as_inline):
        """Drop ``figcaption`` output entirely."""
        return ""

    def convert_img(self, el, text, convert_as_inline):
        """Convert ``img``, preferring the full-size image over a ``_s`` thumbnail.

        Original note: an image should sit on its own line with no text after
        it (example article 6925). The final Markdown is produced by
        ``picture_process.process_img_tag``.
        """
        self.try_complete_img_description(el)
        img_text = super().convert_img(el, text, convert_as_inline)
        # Article 5195: line breaks inside the img tag break the Markdown image.
        img_text = img_text.replace("\r\n", "").replace("\n", "")
        # An empty img tag becomes a blank line.
        if img_text == "![]()":
            return '\n\n'

        src = el.attrs.get('src', None) or ''
        # Use the larger image referenced by the parent hyperlink when the
        # src is that link's "_s" variant.
        if el.parent is not None and el.parent.name == "a":
            href = el.parent.attrs.get('href', None) or ''
            href_path = href.rsplit(".", 1)[0]
            src_path = src.rsplit(".", 1)[0]
            if href_path + "_s" == src_path:
                img_text = img_text.replace(src, href)

        if '_s' in img_text:
            src_path = src.rsplit(".", 1)[0]
            if src_path.endswith('_s'):
                # Drop the trailing "_s" and rebuild the original URL; only
                # switch to it when it is reachable.
                original_url = src_path[:-2] + "." + src.split(".")[-1]
                if self.is_valid_url(original_url):
                    img_text = img_text.replace(src, original_url)

        # Convert and download the image.
        return picture_process.process_img_tag(img_text, self.img_download_path)

    @staticmethod
    def is_img_describe_strong(el) -> bool:
        """Return True when the text of *el* equals the ``alt`` or ``title`` of
        an ``img`` found under the parent of *el*.
        """
        if el is None or el.parent is None:
            return False
        if len(el.contents) == 0:
            return False
        caption = el.text
        return any(
            caption == img_tag.get("alt", None) or caption == img_tag.get("title", None)
            for img_tag in el.parent.find_all("img")
        )

    def convert_b(self, el, text, convert_as_inline):
        """Convert ``b``/``strong`` without bolding titles or image captions."""
        # A b tag that wraps exactly one title div is left untouched so the
        # title is not rendered bold (example article 6925).
        if len(el.contents) == 1:
            title_level = get_markdown_title_level(el.contents[0])
            if title_level != '':
                return text
        # Content that already is a heading is not bolded either.
        if str_is_title(text):
            return text
        if self.is_img_describe_strong(el):
            return ""
        text = text.strip(" \t")
        suffix = ""
        if text.endswith("\n"):
            suffix = " \n"
        b_text = super().convert_b(el, text, convert_as_inline)
        # Pad with spaces so adjacent bold runs do not merge into output like
        # "**1.****版本概述**" (articles 2377, 4292, ...).
        return " " + b_text + suffix + " "

    convert_strong = convert_b

    def convert_p(self, el, text, convert_as_inline):
        """Convert ``p``, surrounding the paragraph with blank lines.

        A paragraph may be followed directly by an ``hdwiki_tmml`` title,
        hence the line breaks on both sides.
        """
        if convert_as_inline:
            return text
        if self.options['wrap']:
            text = fill(text,
                        width=self.options['wrap_width'],
                        break_long_words=False,
                        break_on_hyphens=False)
        return '\n\n%s\n\n' % text if text else ''

    def convert_a(self, el, text, convert_as_inline):
        """Convert ``a``; links that point at an image file yield only their text."""
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        if self.is_href_img(href):
            return text
        title = el.get('title')
        # Article 5195: line breaks inside the tag break the Markdown output.
        if title is not None:
            title = title.replace("\n", "")
        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax
            return '<%s>' % href
        if self.options['default_title'] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

    @staticmethod
    def is_href_img(href_url) -> bool:
        """Return True when *href_url* ends in a jpg/jpeg/png/gif extension."""
        if href_url is None:
            return False
        file_extension = href_url.split(".")[-1].lower()
        return file_extension in ["jpg", "jpeg", "png", "gif"]

    def convert_li(self, el, text, convert_as_inline):
        """Convert ``li``; an empty item produces no output (article 4347)."""
        if not text.strip():
            return ""
        return super().convert_li(el, text, convert_as_inline)

    def convert_td(self, el, text, convert_as_inline):
        """Convert ``td``, turning line breaks inside the cell into ``<br>``."""
        # NOTE(review): replacement literal restored as "<br>" -- confirm.
        text = text.replace("\r\n", "<br>").replace("\n", "<br>")
        return ' ' + text + ' |'

    def convert_hn(self, n, el, text, convert_as_inline):
        """Convert heading level *n* according to the ``heading_style`` option."""
        if convert_as_inline:
            return text
        style = self.options['heading_style'].lower()
        text = text.rstrip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
        hashes = '#' * n + " "
        if style == ATX_CLOSED:
            return '\n\n %s %s %s\n\n' % (hashes, text, hashes)
        return '\n\n%s %s\n\n' % (hashes, text)

    @staticmethod
    def convert_thead_table(el, text, cell_name, convert_as_inline):
        """Render table row *el*, treating cells named *cell_name* as header cells.

        A first row made up only of *cell_name* cells gets a separator line
        below it; a first row that is not a header row gets an empty header
        plus separator above it.
        """
        cells = el.find_all(['td', 'th'])
        is_headrow = all(cell.name == cell_name for cell in cells)
        separator = '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        overline = ''
        underline = ''
        if is_headrow and not el.previous_sibling:
            # first row and is headline: print headline underline
            underline += separator
        elif (not el.previous_sibling
              and (el.parent.name == 'table'
                   or (el.parent.name == 'tbody'
                       and not el.parent.previous_sibling))):
            # first row, not headline, and:
            # - the parent is table or
            # - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
            overline += separator
        return overline + '|' + text + '\n' + underline

    def convert_tr(self, el, text, convert_as_inline):
        """Convert ``tr``, with special handling for ``thead`` and ``colgroup`` layouts."""
        # Tables that contain a thead (articles 4061, 1976).
        if el and el.parent and el.parent.name == "thead":
            return CustomMarkDownConverter.convert_thead_table(
                el, text, 'td', convert_as_inline)
        # table -> colgroup, tbody -> tr layout (article 4364).
        if (el and el.parent and el.parent.previous_sibling
                and el.parent.name == "tbody"
                and el.parent.previous_sibling.name == "colgroup"):
            return CustomMarkDownConverter.convert_thead_table(
                el, text, 'td', convert_as_inline)
        return super().convert_tr(el, text, convert_as_inline)

    def convert_pre(self, el, text, convert_as_inline):
        """Return ``pre`` content unchanged.

        Article 5192 uses ``pre`` for content that is not code, so no code
        block is emitted.
        """
        return text

    def escape(self, text):
        """Escape Markdown-special characters in *text* per the escape options.

        The ``escape_misc`` character class here additionally contains ``%``
        compared with the earlier pattern ``[\\\\&<`[>~#=+|-]``, and the
        escaping of digits followed by ``.`` or ``)`` is deliberately left out
        as unnecessary.
        """
        if not text:
            return ''
        if self.options['escape_misc']:
            text = re.sub(r'([\\&<`[>~#%=+|-])', r'\\\1', text)
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

    @staticmethod
    def convert_span(el, text, convert_as_inline):
        """Convert ``span``; a ``display: block`` span becomes its own paragraph.

        Article 3526: image caption text sits in a span right after the image
        and must not be glued to it.
        """
        style_attr = el.attrs.get('style')
        if style_attr is None:
            return text
        # Walk the CSS declarations looking for the display property. Parsing
        # with partition/strip tolerates "display:block" (no space after the
        # colon) and declarations that lack a colon altogether.
        for declaration in style_attr.split(';'):
            prop, sep, value = declaration.partition(':')
            if not sep or prop.strip().lower() != 'display':
                continue
            if value.strip().lower() == "block" and text != "":
                return f"\n\n{text}\n\n"
        return text


def _parse_span(raw, default=1):
    """Parse a ``colspan``/``rowspan`` attribute value, returning *default* when invalid."""
    try:
        return int(str(raw).strip())
    except (TypeError, ValueError):
        return default


def expand_html_table(html) -> tuple[str, bool]:
    """Expand merged table cells in *html* into plain, unmerged cells.

    Every ``table`` is rebuilt as ``table > tbody > tr`` where a cell spanning
    several rows/columns is repeated, with its text content, in each slot it
    covers.

    Returns:
        ``(new_html, True)`` when at least one table was found, otherwise
        ``(html, False)`` with the input returned unchanged.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table')
    if not tables:
        return html, False

    for table in tables:
        table_rows = table.find_all('tr')
        row_total = len(table_rows)
        # One list of slots per row. Rows grow on demand, so cells pushed to
        # the right by rowspans from earlier rows can never index past a
        # pre-computed width.
        grid = [[] for _ in range(row_total)]

        for r, row in enumerate(table_rows):
            c = 0
            for cell in row.find_all(['td', 'th']):
                # Skip slots already occupied by a rowspan from a previous row.
                while c < len(grid[r]) and grid[r][c] is not None:
                    c += 1
                colspan = min(max(_parse_span(cell.get('colspan', 1)), 1), _MAX_COLSPAN)
                rowspan = _parse_span(cell.get('rowspan', 1))
                remaining = row_total - r
                if rowspan == 0:
                    # rowspan="0" means "span to the end"; approximated here
                    # as the remaining rows of the table.
                    rowspan = remaining
                # Never span past the last row, and always cover at least one.
                rowspan = min(max(rowspan, 1), remaining)

                # Merged cells are expanded by repeating the content in every
                # slot they cover.
                content = (cell.name, cell.get_text())
                for target in grid[r:r + rowspan]:
                    shortfall = c + colspan - len(target)
                    if shortfall > 0:
                        target.extend([None] * shortfall)
                    target[c:c + colspan] = [content] * colspan
                c += colspan

        # Build the replacement table.
        new_table = soup.new_tag('table', border="1", cellspacing="0")
        tbody = soup.new_tag('tbody')
        new_table.append(tbody)
        for grid_row in grid:
            tr = soup.new_tag('tr')
            for slot in grid_row:
                if slot is None:
                    continue
                cell_name, cell_text = slot
                new_cell = soup.new_tag(cell_name)
                new_cell.string = cell_text
                tr.append(new_cell)
            tbody.append(tr)
        # Swap the old table for the expanded one.
        table.replace_with(new_table)

    return str(soup), True


# Create shorthand method for conversion
def md(html, img_download_path, **options):
    """Convert *html* to Markdown with ``CustomMarkDownConverter``.

    Merged table cells are expanded first; afterwards whitespace-only gaps
    between line breaks are collapsed.
    """
    new_html, _ = expand_html_table(html)
    markdown_content = CustomMarkDownConverter(img_download_path, **options).convert(new_html)
    # Remove whitespace sitting between line breaks.
    temp_txt = re.sub(r'\n\s*\n', '\n\n', markdown_content)
    # Cap runs of three or more line breaks at three.
    temp_txt = re.sub(r'\n{3,}', '\n\n\n', temp_txt)
    return temp_txt