Coverage for src/mdslicer/mdslicer.py: 100%
50 statements
« prev ^ index » next coverage.py v7.7.1, created at 2025-03-24 10:06 +0000
« prev ^ index » next coverage.py v7.7.1, created at 2025-03-24 10:06 +0000
1"""
2Parse markdown file into header and sections.
3A header is a dictionary with the metadata of the markdown file.
4Sections are a list of dictionaries with the title, id and content of each section.
5For example:
7.. highlight:: python
8.. code-block:: python
10 sections =
11 [{'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'},
12 {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}]
14"""
16from __future__ import annotations # for compatibility with Python < 3.10
17from pathlib import Path
18from typing import Callable
20import bs4
21import frontmatter
22import markdown # type: ignore
23from markdown.extensions.toc import slugify
def split_header_and_content(file_content: str) -> tuple[dict, str]:
    """
    Separate the YAML header of a markdown document from its body.

    Args:
        file_content: full text of the markdown file

    Returns:
        metadata parsed from the header,
        remaining markdown text
    """
    # frontmatter.parse yields the metadata first and the body second.
    metadata, body = frontmatter.parse(file_content)
    return metadata, body
41class MDSlicer:
42 """
43 Parse markdown content into metadata header and sections
44 """
46 def __init__(self, additional_parser: Callable | None = None, **kwargs):
47 """
48 Create a markdown parser with the given extensions.
51 Args:
52 additional_parser: Additional parser to apply on the markdown content
53 kwargs: Keyword arguments to pass to the `markdown.Markdown() <https://python-markdown.github.io/reference/#Markdown>`_ parser initializer (such as the list of extensions)
54 """
55 self.md = markdown.Markdown(**kwargs)
56 self.md.reset()
57 self.additional_parser = additional_parser
59 def slice_md_content(self, md_content: str) -> list[dict[str, str]]:
60 """
61 Convert markdown content to HTML sections.
63 Args:
64 md_content: Markdown content
66 Returns:
67 List of sections
69 Example:
70 >>> from mdslicer import MDSlicer
71 >>> slicer = MDSlicer()
72 >>> md_content = '''
73 ... # Title
74 ...
75 ... Some content
76 ...
77 ... ## Section 1
78 ...
79 ... Content 1
80 ...
81 ... ## Section 2
82 ...
83 ... Content 2'''
84 >>> slicer.slice_md_content(md_content) # doctest: +NORMALIZE_WHITESPACE
85 [{'title': '', 'id': '', 'content': '<h1>Title</h1>\\n<p>Some content</p>\\n'},
86 {'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'},
87 {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}]
88 """
89 if self.additional_parser:
90 md_content = self.additional_parser(md_content)
91 self.md.reset()
92 html = self.md.convert(md_content)
93 sections = self.get_sections(html)
95 return sections
97 def get_sections(self, html: str) -> list[dict[str, str]]:
98 """
99 Get sections from the HTML content by splitting it with h2 tags
101 Args:
102 html: HTML content
104 Returns:
105 List of sections with an id, a title and an html content
107 Example:
108 >>> from mdslicer import MDSlicer
109 >>> slicer = MDSlicer()
110 >>> html = "<h2>Section 1</h2><p>Content 1</p><h2>Section 2</h2><p>Content 2</p>"
111 >>> slicer.get_sections(html) # doctest: +NORMALIZE_WHITESPACE
112 [{'title': 'Section 1', 'id': 'section-1', 'content': '<p>Content 1</p>'},
113 {'title': 'Section 2', 'id': 'section-2', 'content': '<p>Content 2</p>'}]
114 """
116 # Build section dict
117 soup = bs4.BeautifulSoup(html, "html.parser")
118 sections = []
120 # If section does not start with a h2 tag
121 no_h2_section = ""
122 for tag in soup:
123 if tag.name == "h2": # type: ignore
124 break
125 else:
126 no_h2_section += str(tag)
128 if no_h2_section:
129 sections.append({"title": "", "id": "", "content": no_h2_section})
131 # Parse the rest
132 for h2 in soup.find_all("h2"):
133 title = h2.text
134 content = ""
135 for tag in h2.next_siblings:
136 if tag.name == "h2": # type: ignore
137 break
138 content += str(tag)
139 section = {"title": title, "id": slugify(title, "-"), "content": content}
140 sections.append(section)
142 return sections
144 def slice_content(self, file_content: str) -> tuple[dict, list[dict[str, str]]]:
145 """
146 Parse a markdown string into a YAML header and a content
148 Args:
149 file_content: content of the markdown file
151 Returns:
152 header of the markdown file,
153 content sections of the markdown file
155 Examples:
156 >>> slicer = MDSlicer()
157 >>> file_content = '''
158 ... ---
159 ... title: Example
160 ... ---
161 ...
162 ... ## Section 1
163 ...
164 ... Content 1
165 ...
166 ... ## Section 2
167 ...
168 ... Content 2'''
169 >>> header, sections = slicer.slice_content(file_content)
170 >>> print(header)
171 {'title': 'Example'}
172 >>> sections # doctest: +NORMALIZE_WHITESPACE
173 [{'title': 'Section 1', 'id': 'section-1', 'content': '\\n<p>Content 1</p>\\n'},
174 {'title': 'Section 2', 'id': 'section-2', 'content': '\\n<p>Content 2</p>'}]
176 """
177 header, md_content = split_header_and_content(file_content)
178 sections = self.slice_md_content(md_content)
179 return header, sections
181 def slice_file(self, mdfile_path: str | Path) -> tuple[dict, list[dict[str, str]]]:
182 """
183 Parse a markdown file into a YAML header and a content
185 Args:
186 mdfile_path: Path to the markdown file
188 Returns:
189 header of the markdown file,
190 content sections of the markdown file,
191 """
192 mdfile_path = Path(mdfile_path)
193 file_content = mdfile_path.read_text()
194 return self.slice_content(file_content)