Coverage for src/mdslicer/mdslicer.py: 100%

50 statements  

« prev     ^ index     » next       coverage.py v7.7.1, created at 2025-03-24 10:06 +0000

1""" 

2Parse markdown file into header and sections. 

3A header is a dictionary with the metadata of the markdown file. 

4Sections are a list of dictionaries with the title, id and content of each section. 

5For example: 

6 

7.. highlight:: python 

8.. code-block:: python 

9 

10 sections = 

11 [{'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'}, 

12 {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}] 

13 

14""" 

15 

16from __future__ import annotations # for compatibility with Python < 3.10 

17from pathlib import Path 

18from typing import Callable 

19 

20import bs4 

21import frontmatter 

22import markdown # type: ignore 

23from markdown.extensions.toc import slugify 

24 

25 

def split_header_and_content(file_content: str) -> tuple[dict, str]:
    """
    Separate the front matter header of a markdown document from its body.

    The work is delegated to ``frontmatter.parse``.

    Args:
        file_content: full text of the markdown file

    Returns:
        metadata header of the markdown file,
        markdown body that follows the header
    """
    metadata, body = frontmatter.parse(file_content)
    return metadata, body

39 

40 

41class MDSlicer: 

42 """ 

43 Parse markdown content into metadata header and sections 

44 """ 

45 

46 def __init__(self, additional_parser: Callable | None = None, **kwargs): 

47 """ 

48 Create a markdown parser with the given extensions. 

49 

50 

51 Args: 

52 additional_parser: Additional parser to apply on the markdown content 

53 kwargs: Keyword arguments to pass to the `markdown.Markdown() <https://python-markdown.github.io/reference/#Markdown>`_ parser initializer (such as the list of extensions) 

54 """ 

55 self.md = markdown.Markdown(**kwargs) 

56 self.md.reset() 

57 self.additional_parser = additional_parser 

58 

59 def slice_md_content(self, md_content: str) -> list[dict[str, str]]: 

60 """ 

61 Convert markdown content to HTML sections. 

62 

63 Args: 

64 md_content: Markdown content 

65 

66 Returns: 

67 List of sections 

68 

69 Example: 

70 >>> from mdslicer import MDSlicer 

71 >>> slicer = MDSlicer() 

72 >>> md_content = ''' 

73 ... # Title 

74 ... 

75 ... Some content 

76 ... 

77 ... ## Section 1 

78 ... 

79 ... Content 1 

80 ... 

81 ... ## Section 2 

82 ... 

83 ... Content 2''' 

84 >>> slicer.slice_md_content(md_content) # doctest: +NORMALIZE_WHITESPACE 

85 [{'title': '', 'id': '', 'content': '<h1>Title</h1>\\n<p>Some content</p>\\n'}, 

86 {'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'}, 

87 {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}] 

88 """ 

89 if self.additional_parser: 

90 md_content = self.additional_parser(md_content) 

91 self.md.reset() 

92 html = self.md.convert(md_content) 

93 sections = self.get_sections(html) 

94 

95 return sections 

96 

97 def get_sections(self, html: str) -> list[dict[str, str]]: 

98 """ 

99 Get sections from the HTML content by splitting it with h2 tags 

100 

101 Args: 

102 html: HTML content 

103 

104 Returns: 

105 List of sections with an id, a title and an html content 

106 

107 Example: 

108 >>> from mdslicer import MDSlicer 

109 >>> slicer = MDSlicer() 

110 >>> html = "<h2>Section 1</h2><p>Content 1</p><h2>Section 2</h2><p>Content 2</p>" 

111 >>> slicer.get_sections(html) # doctest: +NORMALIZE_WHITESPACE 

112 [{'title': 'Section 1', 'id': 'section-1', 'content': '<p>Content 1</p>'}, 

113 {'title': 'Section 2', 'id': 'section-2', 'content': '<p>Content 2</p>'}] 

114 """ 

115 

116 # Build section dict 

117 soup = bs4.BeautifulSoup(html, "html.parser") 

118 sections = [] 

119 

120 # If section does not start with a h2 tag 

121 no_h2_section = "" 

122 for tag in soup: 

123 if tag.name == "h2": # type: ignore 

124 break 

125 else: 

126 no_h2_section += str(tag) 

127 

128 if no_h2_section: 

129 sections.append({"title": "", "id": "", "content": no_h2_section}) 

130 

131 # Parse the rest 

132 for h2 in soup.find_all("h2"): 

133 title = h2.text 

134 content = "" 

135 for tag in h2.next_siblings: 

136 if tag.name == "h2": # type: ignore 

137 break 

138 content += str(tag) 

139 section = {"title": title, "id": slugify(title, "-"), "content": content} 

140 sections.append(section) 

141 

142 return sections 

143 

144 def slice_content(self, file_content: str) -> tuple[dict, list[dict[str, str]]]: 

145 """ 

146 Parse a markdown string into a YAML header and a content 

147 

148 Args: 

149 file_content: content of the markdown file 

150 

151 Returns: 

152 header of the markdown file, 

153 content sections of the markdown file 

154 

155 Examples: 

156 >>> slicer = MDSlicer() 

157 >>> file_content = ''' 

158 ... --- 

159 ... title: Example 

160 ... --- 

161 ... 

162 ... ## Section 1 

163 ... 

164 ... Content 1 

165 ... 

166 ... ## Section 2 

167 ... 

168 ... Content 2''' 

169 >>> header, sections = slicer.slice_content(file_content) 

170 >>> print(header) 

171 {'title': 'Example'} 

172 >>> sections # doctest: +NORMALIZE_WHITESPACE 

173 [{'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'}, 

174 {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}] 

175 

176 """ 

177 header, md_content = split_header_and_content(file_content) 

178 sections = self.slice_md_content(md_content) 

179 return header, sections 

180 

181 def slice_file(self, mdfile_path: str | Path) -> tuple[dict, list[dict[str, str]]]: 

182 """ 

183 Parse a markdown file into a YAML header and a content 

184 

185 Args: 

186 mdfile_path: Path to the markdown file 

187 

188 Returns: 

189 header of the markdown file, 

190 content sections of the markdown file, 

191 """ 

192 mdfile_path = Path(mdfile_path) 

193 file_content = mdfile_path.read_text() 

194 return self.slice_content(file_content)