Coverage for src/mdslicer/mdslicer.py: 100%

"""
Parse markdown file into header and sections.
A header is a dictionary with the metadata of the markdown file.
Sections are a list of dictionaries with the title, id and content of each section.
For example:

.. highlight:: python
.. code-block:: python

    sections =
    [{'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'},
    {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}]

"""

16from __future__ import annotations # for compatibility with Python < 3.10

17from pathlib import Path

18from typing import Callable

20import bs4

21import frontmatter

22import markdown # type: ignore

23from markdown.extensions.toc import slugify

def split_header_and_content(file_content: str) -> tuple[dict, str]:
    """
    Separate the YAML header of a markdown document from its body.

    The work is delegated to ``frontmatter.parse``.

    Args:
        file_content: full text of the markdown file

    Returns:
        header (metadata) of the markdown file,
        remaining markdown content
    """
    metadata, body = frontmatter.parse(file_content)
    return metadata, body

class MDSlicer:
    """
    Parse markdown content into metadata header and sections

    The markdown text is converted to HTML with a ``markdown.Markdown``
    instance and the resulting HTML is split into sections at each h2 tag.
    """

    def __init__(self, additional_parser: Callable | None = None, **kwargs):
        """
        Create a markdown parser with the given extensions.

        Args:
            additional_parser: Additional parser to apply on the markdown content.
                When set, it is called with the markdown text and its return
                value is what gets converted to HTML.
            kwargs: Keyword arguments to pass to the `markdown.Markdown() <https://python-markdown.github.io/reference/#Markdown>`_ parser initializer (such as the list of extensions)
        """
        self.md = markdown.Markdown(**kwargs)
        self.md.reset()
        self.additional_parser = additional_parser

    def slice_md_content(self, md_content: str) -> list[dict[str, str]]:
        """
        Convert markdown content to HTML sections.

        Args:
            md_content: Markdown content

        Returns:
            List of sections

        Example:
            >>> from mdslicer import MDSlicer
            >>> slicer = MDSlicer()
            >>> md_content = '''
            ... # Title
            ...
            ... Some content
            ...
            ... ## Section 1
            ...
            ... Content 1
            ...
            ... ## Section 2
            ...
            ... Content 2'''
            >>> slicer.slice_md_content(md_content) # doctest: +NORMALIZE_WHITESPACE
            [{'title': '', 'id': '', 'content': '<h1>Title</h1>\\n<p>Some content</p>\\n'},
            {'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'},
            {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}]
        """
        if self.additional_parser:
            md_content = self.additional_parser(md_content)
        # The same Markdown instance is reused across calls, so it is reset
        # before every conversion.
        self.md.reset()
        html = self.md.convert(md_content)
        return self.get_sections(html)

    def get_sections(self, html: str) -> list[dict[str, str]]:
        """
        Get sections from the HTML content by splitting it with h2 tags

        Whatever precedes the first h2 is returned as a leading section whose
        title and id are empty strings; that section is omitted when nothing
        precedes the first h2.

        Args:
            html: HTML content

        Returns:
            List of sections with an id, a title and an html content

        Example:
            >>> from mdslicer import MDSlicer
            >>> slicer = MDSlicer()
            >>> html = "<h2>Section 1</h2><p>Content 1</p><h2>Section 2</h2><p>Content 2</p>"
            >>> slicer.get_sections(html) # doctest: +NORMALIZE_WHITESPACE
            [{'title': 'Section 1', 'id': 'section-1', 'content': '<p>Content 1</p>'},
            {'title': 'Section 2', 'id': 'section-2', 'content': '<p>Content 2</p>'}]
        """
        soup = bs4.BeautifulSoup(html, "html.parser")
        sections = []

        # Leading section: direct children of the document that come before
        # the first h2. Pieces are collected in a list and joined once.
        leading_parts = []
        for tag in soup:
            if tag.name == "h2":  # type: ignore
                break
            leading_parts.append(str(tag))
        no_h2_section = "".join(leading_parts)
        if no_h2_section:
            sections.append({"title": "", "id": "", "content": no_h2_section})

        # One section per h2: its content is every following sibling up to,
        # but excluding, the next h2.
        # NOTE(review): the loop above only looks at direct children, while
        # find_all() presumably also matches h2 tags nested inside other
        # elements -- confirm nested headings are meant to open a section.
        for h2 in soup.find_all("h2"):
            title = h2.text
            body_parts = []
            for tag in h2.next_siblings:
                if tag.name == "h2":  # type: ignore
                    break
                body_parts.append(str(tag))
            sections.append(
                {
                    "title": title,
                    "id": slugify(title, "-"),
                    "content": "".join(body_parts),
                }
            )

        return sections

    def slice_content(self, file_content: str) -> tuple[dict, list[dict[str, str]]]:
        """
        Parse a markdown string into a YAML header and a content

        Args:
            file_content: content of the markdown file

        Returns:
            header of the markdown file,
            content sections of the markdown file

        Examples:
            >>> slicer = MDSlicer()
            >>> file_content = '''
            ... ---
            ... title: Example
            ... ---
            ...
            ... ## Section 1
            ...
            ... Content 1
            ...
            ... ## Section 2
            ...
            ... Content 2'''
            >>> header, sections = slicer.slice_content(file_content)
            >>> print(header)
            {'title': 'Example'}
            >>> sections # doctest: +NORMALIZE_WHITESPACE
            [{'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'},
            {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}]

        """
        header, md_content = split_header_and_content(file_content)
        sections = self.slice_md_content(md_content)
        return header, sections

    def slice_file(
        self, mdfile_path: str | Path, encoding: str = "utf-8"
    ) -> tuple[dict, list[dict[str, str]]]:
        """
        Parse a markdown file into a YAML header and a content

        Args:
            mdfile_path: Path to the markdown file
            encoding: Text encoding used to read the file. It is passed
                explicitly so the decoded text does not depend on the
                platform's locale encoding.

        Returns:
            header of the markdown file,
            content sections of the markdown file,
        """
        file_content = Path(mdfile_path).read_text(encoding=encoding)
        return self.slice_content(file_content)