#!/usr/bin/python

# Name: extract-PDF-image
# Version: 1.0
# Created: April 19, 2019
# Last modified: April 19, 2019
# Purpose: Extract images from a PDF file; all images are extracted as PNG
# files. The file names produced will have the image dimensions as the
# last part of the name. E.g., img0-11_150x109.png would be a PNG file
# 150 pixels wide and 109 pixels high. If no images are found in the file,
# no output will be produced.
# Description: http://support.moonpoint.com/languages/python/extract-PDF-image.php

import os
import sys

import fitz


def _page_images(doc, page_number):
    """Return the image entries of one page, whichever API name is available."""
    lister = getattr(doc, "get_page_images", None)
    if lister is None:
        # Older PyMuPDF releases only provide the camelCase name.
        lister = doc.getPageImageList
    return lister(page_number)


def _save_png(pix, filename):
    """Write a pixmap to `filename` as PNG, whichever API name is available."""
    saver = getattr(pix, "save", None)
    if saver is None:
        # Older PyMuPDF releases only provide writePNG.
        saver = pix.writePNG
    saver(filename)


def main(argv):
    """Extract every image of the PDF named in argv[1] into the current directory.

    Each image is written as img<page>-<xref>_<width>x<height>.png, where
    <page> is the zero-based page index and <xref> is the image's
    cross-reference number inside the PDF.

    Returns the process exit status: 1 when the file name argument is missing
    or does not name an existing file, 0 otherwise.
    """
    if len(argv) < 2:
        print("Error - missing input file name! Usage ./extract-PDF-image file.pdf")
        return 1

    pdf_path = argv[1]
    if not os.path.isfile(pdf_path):
        print("%s not found" % pdf_path)
        return 1

    # For documentation see
    # https://pymupdf.readthedocs.io/en/latest/document/
    doc = fitz.open(pdf_path)
    try:
        for i in range(len(doc)):
            for img in _page_images(doc, i):
                xref = img[0]
                width = img[2]
                height = img[3]
                pix = fitz.Pixmap(doc, xref)
                if pix.n - pix.alpha >= 4:
                    # CMYK: convert to RGB first; GRAY and RGB can be saved
                    # as PNG directly.
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                _save_png(pix, "img%s-%s_%sx%s.png" % (i, xref, width, height))
                pix = None  # free Pixmap resources
    finally:
        # Release the document even if an image fails to extract.
        doc.close()
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))