22import io
33import json
44import os
5+ import math
56from dataclasses import dataclass
7+ from collections import defaultdict
68from pathlib import Path
79from typing import Dict , List , Optional , Union
810
3032
3133INFO_MESSAGES = io .StringIO ()
3234GRAPHICS_TEXT = "\n \n "
33- CHECK_OCR_TEXT = {"ignore-text" }
3435OCR_FONTNAME = "GlyphLessFont" # if encountered do not use "code" style
3536FLAGS = (
3637 0
4344BULLETS = tuple (utils .BULLETS )
4445
4546
def make_page_chunk(doc, page, text) -> Dict:
    """Create a page chunk dictionary for output.

    Args:
        doc: the ParsedDocument object. Its ``toc``, ``metadata``,
            ``filename`` and ``page_count`` attributes are read. A ``toc``
            or ``metadata`` value of ``None`` is treated as empty.
        page: the PageLayout object. Its ``page_number`` and ``boxes``
            attributes are read.
        text: the page text string

    Returns:
        dict: page chunk dictionary with the keys "metadata", "toc_items",
        "page_boxes" and "text". The object is a ``defaultdict``, so
        looking up any other key yields ``None`` instead of raising
        ``KeyError``.
    """
    # Unknown keys resolve to None rather than raising KeyError.
    chunk = defaultdict(lambda: None)

    # Tolerate documents without a table of contents or metadata.
    toc = doc.toc or []
    # Copy first so the document's own metadata dict is never mutated.
    # The page-specific keys override same-named document keys.
    metadata = dict(doc.metadata or {})
    metadata.update(
        {
            "file_path": doc.filename,
            "page_count": doc.page_count,
            "page_number": page.page_number,
        }
    )
    chunk["metadata"] = metadata

    # Keep only TOC entries whose last item equals this page's number.
    chunk["toc_items"] = [t for t in toc if t[-1] == page.page_number]

    # Round box coordinates outwards to integers (floor the top-left
    # corner, ceil the bottom-right corner) so the integer rectangle
    # always covers the original box.
    chunk["page_boxes"] = [
        (
            math.floor(b.x0),
            math.floor(b.y0),
            math.ceil(b.x1),
            math.ceil(b.y1),
            b.boxclass,
        )
        for b in page.boxes
    ]
    chunk["text"] = text
    return chunk
79+
80+
4681def omit_if_pua_char (text ):
4782 """Check if character is in the Private Use Area (PUA) of Unicode."""
4883 if len (text ) != 1 : # only single characters are checked
@@ -594,7 +629,7 @@ class ParsedDocument:
594629 image_dpi : int = 150 # image resolution
595630 image_format : str = "png" # 'png' or 'jpg'
596631 image_path : str = "" # path to save images
597- use_ocr : bool = True # whether to invoke OCR if beneficial
632+ use_ocr : bool = True # if beneficial invoke OCR
598633
599634 def to_markdown (
600635 self ,
@@ -605,18 +640,24 @@ def to_markdown(
605640 ignore_code : bool = False ,
606641 show_progress : bool = False ,
607642 page_separators : bool = False ,
643+ page_chunks : bool = False ,
608644 ** kwargs ,
609- ) -> str :
645+ ) -> Union [ str , List [ Dict ]] :
610646 """
611647 Serialize ParsedDocument to markdown text.
612648 """
613- output = ""
649+ if page_chunks :
650+ document_output = []
651+ else :
652+ document_output = ""
653+
614654 if show_progress and len (self .pages ) > 5 :
615655 print (f"Generating markdown text..." )
616656 this_iterator = ProgressBar (self .pages )
617657 else :
618658 this_iterator = self .pages
619659 for page in this_iterator :
660+ md_string = ""
620661 # Make a mapping: box number -> list item hierarchy level
621662 list_item_levels = create_list_item_levels (page .boxes )
622663
@@ -633,25 +674,25 @@ def to_markdown(
633674 # pictures and formulas: either write image file or embed
634675 if btype in ("picture" , "formula" , "table-fallback" ):
635676 if isinstance (box .image , str ):
636- output += GRAPHICS_TEXT % box .image + "\n \n "
677+ md_string += GRAPHICS_TEXT % box .image + "\n \n "
637678 elif isinstance (box .image , bytes ):
638679 # make a base64 encoded string of the image
639680 data = base64 .b64encode (box .image ).decode ()
640681 data = f"data:image/{ self .image_format } ;base64," + data
641- output += GRAPHICS_TEXT % data + "\n \n "
682+ md_string += GRAPHICS_TEXT % data + "\n \n "
642683 else :
643- output += f"**==> picture [{ clip .width } x { clip .height } ] intentionally omitted <==**\n \n "
684+ md_string += f"**==> picture [{ clip .width } x { clip .height } ] intentionally omitted <==**\n \n "
644685
645686 # output text in image if requested
646687 if box .textlines :
647688 if btype == "picture" :
648- output += picture_text_to_md (
689+ md_string += picture_text_to_md (
649690 box .textlines ,
650691 ignore_code = ignore_code or page .full_ocred ,
651692 clip = clip ,
652693 )
653694 elif btype == "table-fallback" :
654- output += fallback_text_to_md (
695+ md_string += fallback_text_to_md (
655696 box .textlines ,
656697 ignore_code = ignore_code or page .full_ocred ,
657698 clip = clip ,
@@ -662,30 +703,35 @@ def to_markdown(
662703 if page .full_ocred :
663704 # remove code style if page was OCR'd
664705 table_text = table_text .replace ("`" , "" )
665- output += table_text + "\n \n "
706+ md_string += table_text + "\n \n "
666707 continue
667708 if not hasattr (box , "textlines" ):
668709 print (f"Warning: box { btype } has no textlines" )
669710 continue
670711 if btype == "title" :
671- output += title_to_md (box .textlines )
712+ md_string += title_to_md (box .textlines )
672713 elif btype == "section-header" :
673- output += section_hdr_to_md (box .textlines )
714+ md_string += section_hdr_to_md (box .textlines )
674715 elif btype == "list-item" :
675- output += list_item_to_md (box .textlines , list_item_levels [i ])
716+ md_string += list_item_to_md (box .textlines , list_item_levels [i ])
676717 elif btype == "footnote" :
677- output += footnote_to_md (box .textlines )
718+ md_string += footnote_to_md (box .textlines )
678719 elif not header and btype == "page-header" :
679720 continue
680721 elif not footer and btype == "page-footer" :
681722 continue
682723 else : # treat as normal MD text
683- output += text_to_md (
724+ md_string += text_to_md (
684725 box .textlines , ignore_code = ignore_code or page .full_ocred
685726 )
686727 if page_separators :
687- output += f"--- end of { page .page_number = } ---\n \n "
688- return output
728+ md_string += f"--- end of { page .page_number = } ---\n \n "
729+ if not page_chunks :
730+ document_output += md_string
731+ else :
732+ chunk = make_page_chunk (self , page , md_string )
733+ document_output .append (chunk )
734+ return document_output
689735
690736 def to_json (self , show_progress = False ) -> str :
691737 # Serialize to JSON
@@ -717,22 +763,29 @@ def to_text(
717763 footer : bool = True ,
718764 ignore_code : bool = False ,
719765 show_progress : bool = False ,
766+ page_chunks : bool = False ,
720767 table_format : str = "grid" ,
721- ) -> str :
768+ ** kwargs ,
769+ ) -> Union [str , List [Dict ]]:
722770 """
723771 Serialize ParsedDocument to plain text. Optionally omit page headers or footers.
724772 """
725773 if table_format not in tabulate .tabulate_formats :
726774 print (f"Warning: invalid table format '{ table_format } ', using 'grid'." )
727775 table_format = "grid"
728- # Flatten all text boxes into plain text
729- output = ""
776+
777+ if page_chunks :
778+ document_output = []
779+ else :
780+ document_output = ""
781+
730782 if show_progress and len (self .pages ) > 5 :
731783 print (f"Generating plain text .." )
732784 this_iterator = ProgressBar (self .pages )
733785 else :
734786 this_iterator = self .pages
735787 for page in this_iterator :
788+ text_string = ""
736789 list_item_levels = create_list_item_levels (page .boxes )
737790 for i , box in enumerate (page .boxes ):
738791 clip = pymupdf .IRect (box .x0 , box .y0 , box .x1 , box .y1 )
@@ -742,38 +795,43 @@ def to_text(
742795 if btype == "page-footer" and footer is False :
743796 continue
744797 if btype in ("picture" , "formula" , "table-fallback" ):
745- output += f"==> picture [{ clip .width } x { clip .height } ] <==\n \n "
798+ text_string += f"==> picture [{ clip .width } x { clip .height } ] <==\n \n "
746799 if box .textlines :
747800 if btype == "picture" :
748- output += picture_text_to_text (
801+ text_string += picture_text_to_text (
749802 box .textlines ,
750803 ignore_code = ignore_code or page .full_ocred ,
751804 clip = clip ,
752805 )
753806 elif btype == "table-fallback" :
754- output += fallback_text_to_text (
807+ text_string += fallback_text_to_text (
755808 box .textlines ,
756809 ignore_code = ignore_code or page .full_ocred ,
757810 clip = clip ,
758811 )
759812 continue
760813 if btype == "table" :
761- output += (
814+ text_string += (
762815 tabulate .tabulate (box .table ["extract" ], tablefmt = table_format )
763816 + "\n \n "
764817 )
765818 continue
766819 if btype == "list-item" :
767- output += list_item_to_text (box .textlines , list_item_levels [i ])
820+ text_string += list_item_to_text (box .textlines , list_item_levels [i ])
768821 continue
769822 if btype == "footnote" :
770- output += footnote_to_text (box .textlines )
823+ text_string += footnote_to_text (box .textlines )
771824 continue
772- output += text_to_text (
825+ text_string += text_to_text (
773826 box .textlines , ignore_code = ignore_code or page .full_ocred
774827 )
775- continue
776- return output
828+
829+ if not page_chunks :
830+ document_output += text_string
831+ else :
832+ chunk = make_page_chunk (self , page , text_string )
833+ document_output .append (chunk )
834+ return document_output
777835
778836
779837def parse_document (
0 commit comments