# Source code for maeser.admin_portal.extract_figures

# SPDX-License-Identifier: LGPL-3.0-or-later

"""
Extract figures from a series of PDFs in a directory.

A figure is located by searching the page text for a "Figure X.Y" caption and
rendering a fixed-size region of the page positioned around that caption.
This procedure is *very* rudimentary and could use much improvement.
This module can be executed from the terminal or imported by another script.
"""

import sys
import os
import re
import pymupdf

def extract_figures_with_captions(pdf_path, output_dir):
    """Save one PNG per "Figure X.Y" caption found in a PDF.

    Every text span whose text starts with ``Figure <n>.<m>``
    (case-insensitive) is treated as a figure caption. A fixed-size region
    of the page, horizontally centered on the caption and extending mostly
    above it, is rendered at 300 dpi and written to
    ``<output_dir>/<n>.<m>.png``. If the same label occurs more than once,
    the later image overwrites the earlier one.

    Args:
        pdf_path: Path of the PDF file to scan.
        output_dir: Directory that receives the PNG files. It is created
            if it does not exist.

    Returns:
        int: Number of images written (an overwritten image is counted
        each time it is written).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Capture-region geometry. These values are used as page coordinates,
    # i.e. PDF points (1/72 inch), not pixels: the clip is
    # (width_pt + 2 * padding_pt) x (height_pt + 2 * padding_pt) points,
    # which at 300 dpi renders to roughly 2633 x 1800 pixels.
    width_pt = 600
    height_pt = 400
    padding_pt = 16

    # Compiled once instead of once per span. ``match`` anchors at the
    # start of the span, so only spans that begin with the label qualify.
    caption_pattern = re.compile(r"(Figure\s+(\d+\.\d+))", re.IGNORECASE)

    figures_extracted = 0

    # The context manager closes the document (and its file handle) even
    # if rendering or saving raises.
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry contribute no spans.
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        match = caption_pattern.match(span["text"])
                        if not match:
                            continue

                        fig_label = match.group(2)
                        caption_rect = pymupdf.Rect(span["bbox"])
                        center_x = (caption_rect.x0 + caption_rect.x1) / 2
                        center_y = (caption_rect.y0 + caption_rect.y1) / 2

                        # The figure is assumed to sit directly above its
                        # caption: the window reaches height_pt above the
                        # caption center and only padding_pt below it.
                        # NOTE(review): the window may extend past the page
                        # edge; PyMuPDF is documented to render only the
                        # part of the clip inside the page - confirm.
                        capture_rect = pymupdf.Rect(
                            center_x - (width_pt / 2 + padding_pt),
                            center_y - (height_pt + padding_pt),
                            center_x + (width_pt / 2 + padding_pt),
                            center_y + padding_pt,
                        )

                        pix = page.get_pixmap(clip=capture_rect, dpi=300)
                        image_name = f"{fig_label}.png"
                        pix.save(os.path.join(output_dir, image_name))
                        print(f"Extracted {image_name}")
                        figures_extracted += 1

    return figures_extracted
# TODO: This system is flawed: every PDF writes "<label>.png" into the same
# directory, so figures from the next PDF in the series will overwrite some
# or all of the figures from the PDF preceding it.
def extract_all_figures(target_dir: str) -> None:
    """Identifies all pdfs in **target_dir** and extracts all figures from them.

    Only entries directly inside **target_dir** are considered (no
    recursion). A file counts as a pdf when its name ends in ``.pdf``,
    compared case-insensitively. The extracted images are written into
    **target_dir** itself.

    Files are processed in sorted name order. Images are named after the
    figure label only, so a later pdf can overwrite an image produced by an
    earlier one; sorting makes the outcome the same on every platform
    instead of depending on the order ``os.listdir`` happens to return.

    Args:
        target_dir (str): The directory containing the pdfs.
    """
    for filename in sorted(os.listdir(target_dir)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(target_dir, filename)
        # A directory that merely has a ".pdf" name is not a document.
        if not os.path.isfile(pdf_path):
            continue

        extract_figures_with_captions(pdf_path, target_dir)
if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the directory to scan.
    if len(sys.argv) == 2:
        extract_all_figures(sys.argv[1])
    else:
        print("Usage: python extract_figures.py <directory>")
        sys.exit(1)