Source code for maeser.admin_portal.extract_figures

# SPDX-License-Identifier: LGPL-3.0-or-later

"""
This module is used to extract figures from a series of pdfs in a directory.
It does this by looking for "Figure X.Y" in the text and extracting a screenshot of the page
cropped around this text.
This procedure is *very* rudimentary and could use much improvement.
This module can be executed in the terminal or used within another script.
"""

import sys
import os
import re
import pymupdf


[docs]
def extract_figures_with_captions(pdf_path: str, output_dir: str) -> int:
    """Save a cropped page image for every "Figure X.Y" label found in a pdf.

    Every text span whose text starts with ``Figure <digits>.<digits>``
    (case-insensitive) is treated as a caption. A fixed-size window located
    above the caption is rendered at 300 dpi and written to
    ``<output_dir>/<X.Y>.png``.

    Images are named by figure label only, so a label that matches more than
    once (in this pdf, or in an earlier pdf written to the same directory)
    overwrites the previous image.

    Args:
        pdf_path: Path of the pdf to scan.
        output_dir: Directory the images are written to; created if missing.

    Returns:
        The number of images written, counting overwrites.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Capture-window geometry in PDF points (1 pt = 1/72 inch), not pixels.
    # The pixel size of the saved image follows from rendering at 300 dpi.
    width_pt = 600
    height_pt = 400
    padding_pt = 16

    # Compiled once instead of per span. ``match`` anchors at the start of
    # the span, so "see Figure 1.2" in running text is not picked up.
    label_pattern = re.compile(r"Figure\s+(\d+\.\d+)", re.IGNORECASE)

    figures_extracted = 0

    # The context manager closes the document even if rendering or saving
    # raises part-way through.
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry contribute nothing.
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        match = label_pattern.match(span["text"])
                        if not match:
                            continue

                        fig_label = match.group(1)
                        caption_rect = pymupdf.Rect(span["bbox"])
                        center_x = (caption_rect.x0 + caption_rect.x1) / 2
                        center_y = (caption_rect.y0 + caption_rect.y1) / 2

                        # Horizontally centred on the caption; vertically the
                        # window reaches from well above the caption down to
                        # just below its centre line, on the assumption that
                        # the figure sits above its caption.
                        half_width = width_pt / 2 + padding_pt
                        capture_rect = pymupdf.Rect(
                            center_x - half_width,
                            center_y - (height_pt + padding_pt),
                            center_x + half_width,
                            center_y + padding_pt,
                        )

                        pix = page.get_pixmap(clip=capture_rect, dpi=300)
                        image_name = f"{fig_label}.png"
                        pix.save(os.path.join(output_dir, image_name))
                        print(f"Extracted {image_name}")
                        figures_extracted += 1

    return figures_extracted


# TODO: This system is flawed, since figures from the next pdf in the
# series will overwrite some or all of the figures from the pdf preceding it.

[docs]
def extract_all_figures(target_dir: str) -> None:
    """Identifies all pdfs in **target_dir** and extracts all figures from each pdf.

    Only regular files whose name ends in ``.pdf`` (case-insensitive) are
    processed; a sub-directory that happens to carry a ``.pdf`` suffix is
    skipped. Files are handled in sorted name order so repeated runs over the
    same directory process the pdfs in the same sequence.

    The extracted images are written into **target_dir** itself.

    Args:
        target_dir (str): The directory containing the pdfs.
    """
    # os.listdir() returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(target_dir)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(target_dir, filename)
        if not os.path.isfile(pdf_path):
            # Not an openable document (e.g. a directory named "x.pdf").
            continue

        extract_figures_with_captions(pdf_path, target_dir)


if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the directory
    # holding the pdfs. Anything else prints the usage line and exits with 1.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        extract_all_figures(cli_args[0])
    else:
        print("Usage: python extract_figures.py <directory>")
        sys.exit(1)