import re
from textwrap import fill
import requests
from bs4 import NavigableString
from bs4 import BeautifulSoup
from markdownify import MarkdownConverter, chomp, UNDERLINED, ATX_CLOSED
import copy
from . import picture_process
# Whether a <br> is a line break located inside a table cell.
def judge_br_in_table(el):
    """Report whether *el* is, or is nested inside, a ``td`` or ``tr`` element.

    Walks up the ``parent`` chain starting at *el* itself. Returns ``True``
    at the first node named ``td`` or ``tr`` and ``False`` once a node
    without a parent is reached.
    """
    node = el
    while node.name not in ('td', 'tr'):
        node = node.parent
        if node is None:
            # Reached the top of the tree without meeting a cell or row.
            return False
    return True
# 获取div标签中是否为标题,如果是标题则markdown中的返回标题等级
def get_markdown_title_level(el):
    """Return the Markdown heading prefix encoded by a title ``div``.

    Returns ``'## '`` when the element's ``class`` attribute contains
    ``hdwiki_tmml``, ``'### '`` when it contains ``hdwiki_tmmll``, and an
    empty string for anything else (non-``div`` elements, elements without
    a ``class`` attribute, or other classes).
    """
    if el.name != 'div' or 'class' not in el.attrs:
        return ''
    classes = el.attrs['class']
    # Checked in this order: the first matching class wins.
    for class_name, prefix in (('hdwiki_tmml', '## '), ('hdwiki_tmmll', '### ')):
        if class_name in classes:
            return prefix
    return ''
def str_is_title(text) -> bool:
    """Return ``True`` when *text*, ignoring surrounding whitespace, starts with ``#``."""
    # A leading run of one or more '#' is the same as "first character is '#'".
    return text.strip().startswith('#')
# 判断el 是否是图片的DIV标签
def is_img_div_tag(el) -> bool:
    """Return ``True`` when *el* is a ``div`` whose ``class`` contains ``img`` or ``img_l``.

    ``None``, non-``div`` elements and ``div`` elements without a ``class``
    attribute all yield ``False``.
    """
    if el is None or el.name != "div":
        return False
    classes = el.get('class')
    if classes is None:
        return False
    return "img" in classes or "img_l" in classes
# 判断div内部是否是纯文本内容,并且display是否为block
def is_only_text_div(el) -> bool:
    """Return ``True`` when *el* is a non-empty ``div`` whose ``.string`` is a ``NavigableString``.

    Returns ``False`` for ``None``, for non-``div`` elements, for a ``div``
    with empty text, and for a ``div`` whose ``display`` attribute is present
    and not ``"block"``.

    Always returns a real ``bool``: the fall-through case used to return
    ``None`` implicitly, contradicting the annotation.
    """
    if el is None or el.name != "div" or el.text == "":
        return False
    # NOTE(review): this reads an HTML attribute named "display", not the CSS
    # property inside "style" -- confirm that this is what is intended.
    if el.get("display", "block") != "block":
        return False
    # The div qualifies only when bs4 exposes its content as a single string.
    return isinstance(el.string, NavigableString)
# Compatibility: a <p> may be immediately followed by an hdwiki_tmml title,
# so the paragraph is padded with blank lines on both sides.
def convert_p(self, el, text, convert_as_inline):
    """Render a paragraph surrounded by blank lines.

    Inline conversions return *text* untouched. When the ``wrap`` option is
    set, the text is first re-flowed to ``wrap_width`` columns without
    breaking long words or hyphenated words. Empty text yields ``''``.
    """
    if convert_as_inline:
        return text
    options = self.options
    if options['wrap']:
        text = fill(
            text,
            width=options['wrap_width'],
            break_long_words=False,
            break_on_hyphens=False,
        )
    if not text:
        return ''
    # Blank line before and after the paragraph.
    return '\n\n%s\n\n' % text
def convert_a(self, el, text, convert_as_inline):
    """Render an ``<a>`` element as a Markdown link.

    * Empty link text yields ``''``.
    * Links whose ``href`` points at an image (see ``is_href_img``) are
      reduced to their text.
    * With the ``autolinks`` option, a link whose text equals its ``href``
      and has no title becomes ``<href>``.
    * Otherwise ``[text](href "title")`` is produced, or just the text when
      there is no ``href``.
    """
    prefix, suffix, text = chomp(text)
    if not text:
        return ''

    href = el.get('href')
    if self.is_href_img(href):
        return text

    title = el.get('title')
    if title is not None:
        # Article 5195: a line break inside the title broke the rendered Markdown.
        title = title.replace("\n", "")

    options = self.options
    # For the replacement see #29: underscores in text nodes are escaped.
    use_shortcut = (options['autolinks']
                    and text.replace(r'\_', '_') == href
                    and not title
                    and not options['default_title'])
    if use_shortcut:
        # Shortcut syntax
        return '<%s>' % href

    if options['default_title'] and not title:
        title = href
    if not href:
        return text
    title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
    return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix)
@staticmethod
def is_href_img(href_url) -> bool:
if href_url is None:
return False
file_extension = href_url.split(".")[-1]
# 不是图片不处理
file_extension = file_extension.lower()
if file_extension not in ["jpg", "jpeg", "png", "gif"]:
return False
return True
def convert_li(self, el, text, convert_as_inline):
    """Render a list item, dropping items that contain only whitespace.

    Blank ``<li>`` elements (seen in article 4347) produce ``""``; every
    other item is rendered by the parent class.
    """
    if text.strip():
        return super().convert_li(el, text, convert_as_inline)
    return ""
def convert_td(self, el, text, convert_as_inline):
    """Render a table cell as `` text |`` with line breaks kept inside the cell.

    A raw newline would end the Markdown table row, so ``\\r\\n`` and ``\\n``
    in the cell text are replaced by an inline HTML break.
    """
    # NOTE(review): the replacement literal was unreadable in the reviewed
    # source (split across two lines); "<br>" is restored here -- confirm it
    # matches the original.
    # "\r\n" must be handled first so that it becomes a single break.
    text = text.replace("\r\n", "<br>").replace("\n", "<br>")
    return ' ' + text + ' |'
def convert_hn(self, n, el, text, convert_as_inline):
    """Render a level-*n* heading surrounded by blank lines.

    Inline conversions return *text* unchanged. With the ``UNDERLINED``
    heading style, levels 1 and 2 are delegated to ``self.underline`` using
    ``=`` and ``-`` respectively. Every other case produces an ATX heading;
    the ``ATX_CLOSED`` style repeats the marker after the text.
    """
    if convert_as_inline:
        return text

    heading_style = self.options['heading_style'].lower()
    heading = text.rstrip()

    if heading_style == UNDERLINED and n <= 2:
        return self.underline(heading, '=' if n == 1 else '-')

    # The marker carries its own trailing space, in addition to the space
    # contained in the format strings below.
    marker = '#' * n + " "
    if heading_style == ATX_CLOSED:
        return '\n\n %s %s %s\n\n' % (marker, heading, marker)
    return '\n\n%s %s\n\n' % (marker, heading)
@staticmethod
def convert_thead_table(el, text, cell_name, convert_as_inline):
cells = el.find_all(['td', 'th'])
is_headrow = all([cell.name == cell_name for cell in cells])
overline = ''
underline = ''
if is_headrow and not el.previous_sibling:
# first row and is headline: print headline underline
underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
elif (not el.previous_sibling
and (el.parent.name == 'table'
or (el.parent.name == 'tbody'
and not el.parent.previous_sibling))):
# first row, not headline, and:
# - the parent is table or
# - the parent is tbody at the beginning of a table.
# print empty headline above this row
overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
return overline + '|' + text + '\n' + underline
def convert_tr(self, el, text, convert_as_inline):
    """Render a table row, special-casing rows that must act as the header.

    Rows are rendered by ``convert_thead_table`` (treating ``td`` as the
    header cell name) when:

    * the row sits inside a ``thead`` (articles 4061 and 1976), or
    * the row sits inside a ``tbody`` whose previous sibling is a
      ``colgroup`` (article 4364).

    Every other row is rendered by the parent class.
    """
    parent = el.parent if el else None
    if parent:
        if parent.name == "thead":
            return CustomMarkDownConverter.convert_thead_table(el, text, 'td', convert_as_inline)
        # ``previous_sibling`` replaces the deprecated camelCase alias
        # ``previousSibling`` and matches the spelling used in
        # convert_thead_table.
        previous = parent.previous_sibling
        if previous and parent.name == "tbody" and previous.name == "colgroup":
            return CustomMarkDownConverter.convert_thead_table(el, text, 'td', convert_as_inline)
    return super().convert_tr(el, text, convert_as_inline)
def convert_pre(self, el, text, convert_as_inline):
    """Return the content of a ``<pre>`` element unchanged.

    Article 5192 contains ``<pre>`` elements whose content is not code, so
    no extra handling is applied to them.
    """
    return text
def escape(self, text):
    """Backslash-escape Markdown-significant characters in *text*.

    Falsy input yields ``''``. Each group of characters is escaped only when
    its option is enabled: ``escape_misc``, ``escape_asterisks`` and
    ``escape_underscores``.
    """
    if not text:
        return ''
    options = self.options
    if options['escape_misc']:
        # The set includes '%'. Escaping "digit followed by . or )" is
        # deliberately left out as unnecessary.
        text = re.sub(r'([\\&<`[>~#%=+|-])', r'\\\1', text)
    replacements = (
        ('escape_asterisks', '*', r'\*'),
        ('escape_underscores', '_', r'\_'),
    )
    for option_name, char, escaped in replacements:
        if options[option_name]:
            text = text.replace(char, escaped)
    return text
@staticmethod
def convert_span(el, text, convert_as_inline):
# 文章3526出现图片后面紧接图片文本的问题。图片文本在span标签内
if "style" not in el.attrs:
return text
style_attr = el.attrs['style']
if style_attr is None:
return text
style_content = style_attr.split(';')
# 遍历style属性内容,找到display的值
for item in style_content:
if 'display' in item:
display_value = item.split(': ')[1] # 获取冒号后的值
if display_value == "block" and text != "":
return f"\n\n{text}\n\n"
return text
def _cell_span(cell, attr) -> int:
    """Return the ``colspan``/``rowspan`` value of *cell* as an int of at least 1."""
    try:
        span = int(cell.get(attr, 1))
    except (TypeError, ValueError):
        # Empty or non-numeric values are treated as "no span".
        return 1
    return max(span, 1)


def _layout_cells(table_rows):
    """Place the cells of *table_rows* on a grid, repeating merged cells in every slot they cover."""
    # Rows start empty and grow on demand, so a cell pushed to the right by
    # a rowspan from an earlier row can never fall outside the grid.
    grid = [[] for _ in table_rows]
    for r, row in enumerate(table_rows):
        c = 0
        for cell in row.find_all(['td', 'th']):
            line = grid[r]
            # Skip the slots already taken by a rowspan from a previous row.
            while c < len(line) and line[c] is not None:
                c += 1
            colspan = _cell_span(cell, 'colspan')
            rowspan = _cell_span(cell, 'rowspan')
            # Slicing clamps a rowspan that runs past the last row.
            for target in grid[r:r + rowspan]:
                missing = c + colspan - len(target)
                if missing > 0:
                    target.extend([None] * missing)
                # Merged cells are split by repeating their content.
                target[c:c + colspan] = [cell] * colspan
            c += colspan
    return grid


def expand_html_table(html) -> tuple[str, bool]:
    """Rewrite every table in *html* so that no cell spans rows or columns.

    Each ``<table>`` is replaced by a new ``<table border="1"
    cellspacing="0">`` with a single ``<tbody>``. A cell with ``colspan`` or
    ``rowspan`` is repeated in each slot it covered. The rebuilt cells keep
    the original tag name (``td``/``th``) but only the text of the original
    cell (``get_text()``), so markup nested inside a cell is not preserved.

    Spans that are malformed, or that run past the end of the table, are
    tolerated instead of raising ``ValueError``/``IndexError``.

    Returns:
        ``(html, False)`` with the input untouched when it has no table,
        otherwise ``(rewritten_html, True)``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table')
    if not tables:
        return html, False

    for table in tables:
        # NOTE(review): find_all is recursive, so rows and cells of nested
        # tables are also collected for the outer table -- confirm that
        # nested tables are not expected in the input.
        grid = _layout_cells(table.find_all('tr'))

        # Build the replacement table.
        new_table = soup.new_tag('table', border="1", cellspacing="0")
        tbody = soup.new_tag('tbody')
        new_table.append(tbody)
        for line in grid:
            tr = soup.new_tag('tr')
            for cell in line:
                if cell is not None:
                    new_cell = soup.new_tag(cell.name)
                    new_cell.string = cell.get_text()
                    tr.append(new_cell)
            tbody.append(tr)

        # Swap the old table for the new one in the document.
        table.replace_with(new_table)

    return str(soup), True
# Create shorthand method for conversion
def md(html, img_download_path, **options):
    """Convert *html* to Markdown (shorthand entry point).

    Tables are first flattened with ``expand_html_table``. The result is
    converted by ``CustomMarkDownConverter(img_download_path, **options)``,
    and the blank lines in the output are then normalised.
    """
    expanded_html, _ = expand_html_table(html)
    converter = CustomMarkDownConverter(img_download_path, **options)
    markdown = converter.convert(expanded_html)
    # Collapse any whitespace-only gap between two newlines into one blank line.
    markdown = re.sub(r'\n\s*\n', '\n\n', markdown)
    # Cap longer runs of newlines at three.
    # NOTE(review): after the substitution above no run of three or more
    # newlines appears to remain, so this looks like a no-op -- confirm.
    return re.sub(r'\n{3,}', '\n\n\n', markdown)