Edit PDF to remove all non images (python, node, c#)
New here? Learn about Bountify and follow @bountify to get notified of new bounties! x

We would like a proof of concept (POC) of a process that removes all non-image content from a PDF and then outputs the PDF with only the images left in it, in their original positions. This means removing text, tables, characters, etc.

We are open to Python, Node, C#, Java, or Go. Please use https://replit.com/ for your demo. We should be able to give it a PDF like this one:

https://www.fema.gov/sites/default/files/2020-05/fim_appendix-c-lowest-floor-guide_apr2020.pdf

Let's make sure SVG graphics are kept, just as an image would be.

8 days ago

Crowdsource coding tasks.

1 Solution


Processing a PDF can be slow, and the internal structure varies depending on the application that created the file, so the script may fail on other PDFs. Here is my solution, which works with the provided PDF:

https://replit.com/@CarlosOlivo/pdfimages

Repl.it usually kills long-running processes, so it is best to try it locally.

Requirements:

> Python 3.8+
$ pip3 install pikepdf

Usage - Images & Figures (SVG)

$ python pdf_images.py fim_appendix-c-lowest-floor-guide_apr2020.pdf output.pdf

Usage - Images only

$ python pdf_images.py fim_appendix-c-lowest-floor-guide_apr2020.pdf output.pdf --images

Usage:

usage: pdf_images.py [-h] [--password [PASSWORD]] [--linearize] [--images] source_file [target_file]

Script that tries to remove all non-graphic data from a PDF file.

positional arguments:
  source_file           Source PDF file.
  target_file           Target PDF file. If a file exists in this location it will be overwritten. (Defaults to the source PDF file if not specified - slow, not recommended)

optional arguments:
  -h, --help            show this help message and exit
  --password [PASSWORD] Password to process an encrypted PDF file.
  --linearize           Enables creating linear or "fast web view", where the file's contents are organized sequentially.
  --images              Preserve only images.

pdf_images.py

import argparse
import pikepdf

def is_figure(operands):
    """Return True if any operand is the PDF name ``/Figure``.

    Only operands that are ``pikepdf.Name`` instances are considered; they
    are compared by their string form.
    """
    return any(
        isinstance(candidate, pikepdf.Name) and str(candidate) == "/Figure"
        for candidate in operands
    )

def is_image(operands, page):
    """Tell whether the XObject named by the operands is an image.

    The first ``pikepdf.Name`` operand is looked up in
    ``page.Resources.XObject`` and the result says whether that entry's
    ``/Subtype`` equals ``/Image``. Operands after the first name are never
    examined. Returns False when no operand is a name.
    """
    xobject_name = next(
        (item for item in operands if isinstance(item, pikepdf.Name)),
        None,
    )
    if xobject_name is None:
        return False
    xobject = page.Resources.XObject[xobject_name]
    return xobject.get("/Subtype") == "/Image"

def is_operator(stream_object, operator):
    """Return True if a content-stream instruction uses the given operator.

    ``stream_object`` is unpacked as an ``(operands, operator)`` pair and
    ``operator`` is the operator's textual name, e.g. ``"Q"``.
    """
    _operands, found = stream_object
    return bool(found == pikepdf.Operator(operator))

def find_graphics_state(content_stream, subrange):
    """Widen an index range to include its enclosing ``q`` ... ``Q`` scopes.

    ``subrange`` is an inclusive ``(start, end)`` pair of indices into
    ``content_stream``. Every ``Q`` instruction that immediately follows
    ``end`` is absorbed into the range. The start is then moved backwards
    until each absorbed ``Q`` has been paired with an opening instruction.
    ``q``, ``BMC`` and ``BDC`` count as openers, while ``Q`` and ``EMC``
    seen on the way back count as additional closers to balance.

    If no ``Q`` directly follows the range, it is returned unchanged.

    Returns:
        The widened inclusive ``(start, end)`` tuple.
    """
    first, last = subrange
    total = len(content_stream)
    openers = {"q", "BMC", "BDC"}
    closers = {"Q", "EMC"}

    # Phase 1: swallow the run of "Q" instructions right after the range.
    depth = 0
    while last + 1 < total and is_operator(content_stream[last + 1], "Q"):
        last += 1
        depth += 1

    if depth:
        # Restart the backward scan from the first absorbed "Q", so the
        # instructions of the original range take part in the balancing.
        first = last - depth + 1

    # Phase 2: walk backwards until every pending closer is matched.
    while depth > 0:
        first -= 1
        _, op = content_stream[first]
        name = str(op)
        if name in openers:
            depth -= 1
        elif name in closers:
            depth += 1

    return (first, last)

def remove_artifacts(content_stream, page):
    """Reduce a parsed content stream to the instructions that draw graphics.

    Two kinds of content are kept:

    * marked-content sections whose opening ``BMC``/``BDC`` carries a
      ``/Figure`` operand, from that opener through its matching ``EMC``;
    * ``Do`` instructions found outside such a section for which
      ``is_image()`` returns True.

    In addition, the most recent ``k`` instruction seen outside a figure
    section is kept when a figure section starts. Every kept range is then
    widened by ``find_graphics_state()``.

    Args:
        content_stream: Sequence of ``(operands, operator)`` instructions.
        page: Page object passed through to ``is_image()``.

    Returns:
        A new flat list containing the instructions of all kept ranges.

    Side effects:
        Prints the number of kept ranges to standard output.
    """
    # Inclusive (start, end) index ranges of the instructions to keep.
    objects = []
    # Marked-content nesting depth inside a /Figure section (0 = outside).
    count = 0
    # Index of the BMC/BDC instruction that opened the current figure.
    index_figure = 0
    # Index of the last "k" instruction seen outside a figure, if any.
    pos_k = None
    for index, object_stream in enumerate(content_stream):
        operands, operator = object_stream
        if str(operator) in ["BMC", "BDC"]:
            if count == 0 and is_figure(operands):
                # A figure section starts: keep the pending "k" instruction
                # (recorded once, then forgotten) and remember the opener.
                if pos_k is not None:
                    objects.append((pos_k, pos_k))
                    pos_k = None
                index_figure = index
                count += 1
                continue
            # Marked content nested inside a figure only deepens the nesting;
            # marked content outside any figure is ignored.
            if count > 0:
                count += 1
            continue
        if str(operator) == "EMC" and count > 0:
            count -= 1
            # Depth back at zero: the figure section is complete.
            if count == 0:
                objects.append((index_figure, index))
            continue
        if str(operator) == "k" and count == 0:
            pos_k = index
            continue
        # Images drawn inside a figure section are already covered by the
        # figure's range, so only top-level image invocations are recorded.
        if str(operator) == "Do" and count == 0 and is_image(operands, page):
            objects.append((index, index))
    # Widen each range to its surrounding graphics-state scope, then turn the
    # index ranges into slices of instructions.
    objects = [find_graphics_state(content_stream, (start, end)) for (start, end) in objects]
    objects = [content_stream[start:end+1] for (start, end) in objects]
    print(f"- Keeping {len(objects)} elements", end=" ", flush=True)
    # Flatten the kept slices into one instruction list.
    new_content_stream = []
    for object_stream in objects:
        new_content_stream.extend(object_stream)
    return new_content_stream

def allowed_operands(args):
    """Build the space-separated list of operators to keep when parsing.

    Args:
        args: Parsed command-line options; only ``args.images`` is read.

    Returns:
        When ``args.images`` is truthy, just the special graphics state and
        image operators. Otherwise every operator group listed below, in
        declaration order.
    """
    operator_groups = {
        "general_graphics_state": "w J j M d ri i gs",
        "special_graphics_state": "q Q cm",
        "path_operators": "m l c v y h re S s f F f* B B* b b* n W W* sh",
        "color_operators": "CS cs SC SCN sc scn G g RG rg K k",
        "image_operators": "BI ID EI Do",
        "marked_content": "MP DP BMC BDC EMC",
    }
    if args.images:
        selected = ("special_graphics_state", "image_operators")
    else:
        selected = tuple(operator_groups)
    return " ".join(operator_groups[group] for group in selected)

def progress(percent):
    """Print the save progress on a single, self-overwriting console line.

    Passed as the ``progress`` callback when the PDF is saved.

    Args:
        percent: Completion percentage reported by the caller.
    """
    # The original interpolated the literal 0 here, so the line always read
    # "0%"; show the value that was actually reported.
    print(f"Saving file - {percent}% ...", end="\r", flush=True)

def dump(content_stream):
    """Write every instruction of a content stream to ``test.txt``.

    Debugging aid: the file is created or overwritten in the current working
    directory, with one line per ``(operands, operator)`` instruction.
    """
    with open("test.txt", 'w', encoding="utf-8") as out:
        out.writelines(
            f"Operands {operands}, operator {operator}\n"
            for operands, operator in content_stream
        )

def main(args):
    """Strip non-graphic content from every page of a PDF and save the result.

    Each page's content stream is parsed with only the operators returned by
    ``allowed_operands(args)``, filtered by ``remove_artifacts()`` and written
    back as the page's new ``Contents``. Unreferenced resources are then
    removed and the document is saved.

    Args:
        args: Parsed command-line options. Reads ``source_file``,
            ``target_file`` (``None`` means the source file is overwritten),
            ``password`` and ``linearize`` here; ``allowed_operands()`` reads
            the rest.

    Password, PDF and missing-file errors are reported on standard output
    rather than raised.
    """
    try:
        with pikepdf.open(
            args.source_file,
            password=args.password,
            # Saving without a target writes back to the source file, which
            # pikepdf only permits when asked for at open time.
            allow_overwriting_input=args.target_file is None,
        ) as pdf:
            print(f"Found {args.source_file} [v{pdf.pdf_version}] with {len(pdf.pages)} pages", flush=True)
            for count, page in enumerate(pdf.pages, start=1):
                print(f"Processing page {count}", end=" ", flush=True)
                content_stream = pikepdf.parse_content_stream(page, allowed_operands(args))
                print(f"- Found {len(content_stream)} objects", end=" ", flush=True)
                # Debugging aid: call dump(content_stream) before and/or after
                # the next line to inspect the instructions.
                content_stream = remove_artifacts(content_stream, page)
                new_content_stream = pikepdf.unparse_content_stream(content_stream)
                page.Contents = pdf.make_stream(new_content_stream)
                print("- Done", flush=True)
            print("Removing unreferenced resources", end=" ", flush=True)
            pdf.remove_unreferenced_resources()
            print("- Done", flush=True)
            # Passing the input's encryption state keeps an encrypted file
            # encrypted and an unencrypted one unencrypted.
            pdf.save(args.target_file, linearize=args.linearize, progress=progress, encryption=pdf.is_encrypted)
            print("Saving file - 100% - Done", end="", flush=True)
    # Use the public exception names: the private ``pikepdf._qpdf`` module is
    # an implementation detail and is not present in every pikepdf release.
    # PasswordError is listed first so it gets its own message.
    except pikepdf.PasswordError as e:
        print(f"Invalid password: {e}")
    except (pikepdf.PdfError, TypeError) as e:
        print(f"Invalid file: {e}")
    except FileNotFoundError as e:
        print(f"File not found: {e}")


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    cli = argparse.ArgumentParser(
        description="Script that tries to remove all non-graphic data from a PDF file.",
    )

    # Positional arguments; the target file is optional and defaults to None.
    cli.add_argument("source_file", help="Source PDF file.", type=str)
    cli.add_argument(
        "target_file",
        nargs='?',
        default=None,
        type=str,
        help="Target PDF file. If a file exists in this location it will be overwritten. (Defaults to the source PDF file if not specified - slow, not recommended)",
    )

    # Optional switches.
    cli.add_argument(
        "--password",
        nargs='?',
        default="",
        type=str,
        help="Password to process an encrypted PDF file.",
    )
    cli.add_argument(
        "--linearize",
        default=False,
        action="store_true",
        help="Enables creating linear or “fast web view”, where the file’s contents are organized sequentially.",
    )
    cli.add_argument(
        "--images",
        default=False,
        action="store_true",
        help="Preserve only images.",
    )

    main(cli.parse_args())
Testing! Will report back shortly. Thx in advance
Qdev 6 days ago
Sure, I've noticed that Repl.it tends to corrupt large PDFs, make sure to test it locally as well.
Carlos Olivo 3 days ago