Skip to content

Commit da73d64

Browse files
committed
Initial work to port #55 to MarkItDown 0.1.X
1 parent 82d84e3 commit da73d64

File tree

6 files changed

+96
-1
lines changed

6 files changed

+96
-1
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ At present, MarkItDown supports:
1717
- PowerPoint
1818
- Word
1919
- Excel
20+
- OneNote
2021
- Images (EXIF metadata and OCR)
2122
- Audio (EXIF metadata and speech transcription)
2223
- HTML
@@ -82,6 +83,7 @@ At the moment, the following optional dependencies are available:
8283
* `[xls]` Installs dependencies for older Excel files
8384
* `[pdf]` Installs dependencies for PDF files
8485
* `[outlook]` Installs dependencies for Outlook messages
86+
* `[onenote]` Installs dependencies for OneNote .one files
8587
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
8688
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
8789
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription

packages/markitdown/pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,14 +45,16 @@ all = [
4545
"SpeechRecognition",
4646
"youtube-transcript-api",
4747
"azure-ai-documentintelligence",
48-
"azure-identity"
48+
"azure-identity",
49+
"one-extract",
4950
]
5051
pptx = ["python-pptx"]
5152
docx = ["mammoth"]
5253
xlsx = ["pandas", "openpyxl"]
5354
xls = ["pandas", "xlrd"]
5455
pdf = ["pdfminer.six"]
5556
outlook = ["olefile"]
57+
onenote = ["one-extract"]
5658
audio-transcription = ["pydub", "SpeechRecognition"]
5759
youtube-transcription = ["youtube-transcript-api"]
5860
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]

packages/markitdown/src/markitdown/_markitdown.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
BingSerpConverter,
3131
PdfConverter,
3232
DocxConverter,
33+
OneNoteConverter,
3334
XlsxConverter,
3435
XlsConverter,
3536
PptxConverter,
@@ -158,6 +159,7 @@ def enable_builtins(self, **kwargs) -> None:
158159
self.register_converter(YouTubeConverter())
159160
self.register_converter(BingSerpConverter())
160161
self.register_converter(DocxConverter())
162+
self.register_converter(OneNoteConverter())
161163
self.register_converter(XlsxConverter())
162164
self.register_converter(XlsConverter())
163165
self.register_converter(PptxConverter())

packages/markitdown/src/markitdown/converters/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from ._bing_serp_converter import BingSerpConverter
1212
from ._pdf_converter import PdfConverter
1313
from ._docx_converter import DocxConverter
14+
from ._onenote_converter import OneNoteConverter
1415
from ._xlsx_converter import XlsxConverter, XlsConverter
1516
from ._pptx_converter import PptxConverter
1617
from ._image_converter import ImageConverter
@@ -29,6 +30,7 @@
2930
"BingSerpConverter",
3031
"PdfConverter",
3132
"DocxConverter",
33+
"OneNoteConverter",
3234
"XlsxConverter",
3335
"XlsConverter",
3436
"PptxConverter",
packages/markitdown/src/markitdown/converters/_onenote_converter.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
import sys
2+
3+
from typing import BinaryIO, Any
4+
5+
from ._html_converter import HtmlConverter
6+
from .._base_converter import DocumentConverter, DocumentConverterResult
7+
from .._stream_info import StreamInfo
8+
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
9+
10+
# Try loading optional (but in this case, required) dependencies
11+
# Save reporting of any exceptions for later
12+
# `one_extract` is only installed with the optional OneNote dependencies.
# Record the import failure (if any) instead of raising at import time, so
# the converter can report it when a conversion is actually attempted.
try:
    import one_extract
except ImportError:
    # Keep the (type, value, traceback) triple for later re-raising.
    _dependency_exc_info = sys.exc_info()
else:
    _dependency_exc_info = None


# No MIME type prefixes are accepted; matching is by file extension only.
ACCEPTED_MIME_TYPE_PREFIXES = []

# File extensions (lower-case, with leading dot) handled by this converter.
ACCEPTED_FILE_EXTENSIONS = [".one"]
23+
24+
25+
class OneNoteConverter(DocumentConverter):
    """
    Converts OneNote (.one) files to Markdown.

    Every notebook section becomes a level-1 heading and every page a
    level-2 heading. Page content is run through ``HtmlConverter``.
    """

    def __init__(self):
        super().__init__()
        # Reused for every page; page content is converted via convert_string().
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return True when the stream looks like a OneNote file.

        The decision is based on ``stream_info`` only: its extension must be
        in ACCEPTED_FILE_EXTENSIONS, or its MIME type must start with one of
        ACCEPTED_MIME_TYPE_PREFIXES. ``file_stream`` is not read.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(mimetype.startswith(prefix) for prefix in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a OneNote notebook stream to Markdown.

        Raises:
            MissingDependencyException: if ``one_extract`` could not be
                imported; the original ImportError is chained as the cause.
        """
        # Check: the dependencies
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".one",
                    feature="onenote",
                )
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        # Perform the conversion. Fragments are collected and joined once
        # rather than growing a string with += inside nested loops.
        parts = []
        notebook = one_extract.Notebook(file_stream)
        for section in notebook.sections:
            parts.append(f"\n\n# {section.name}\n")
            for page in section.pages:
                parts.append(f"\n\n## {page.name}\n")
                # NOTE(review): presumably page.content is an HTML string,
                # since it is handed to HtmlConverter.convert_string -- confirm
                # against one_extract.
                page_md = self._html_converter.convert_string(page.content).markdown
                parts.append(page_md.strip() + "\n\n")

        # The result type exposes the text as `markdown` (see the `.markdown`
        # read above), so construct it with that keyword rather than the old
        # `text_content` name.
        return DocumentConverterResult(
            markdown="".join(parts).strip(),
            title=None,
        )
161 KB
Binary file not shown.

0 commit comments

Comments
 (0)