#!/usr/bin/python

# Name: extract-PDF-image
# Version: 1.0
# Created: April 19, 2019
# Last modified: April 19, 2019
# Purpose: Extract images from a PDF file; all images are extracted as PNG
# files. The file names produced will have the image dimensions as the
# last part of the name. E.g., img0-11_150x109.png would be a PNG file
# 150 pixels wide and 109 pixels high. If no images are found in the file,
# no output will be produced.
# Description: http://support.moonpoint.com/languages/python/extract-PDF-image.php

import fitz, os, sys

# Validate the command line and open the input PDF.
# Exits with status 1 (after printing a message) when the file name argument
# is missing or does not refer to an existing regular file. On success the
# opened document is bound to the module-level name `doc`.
# NOTE: print(...) with a single string argument is used so the script parses
# and produces the same output under both Python 2 and Python 3.
if len(sys.argv) < 2:
    print("Error - missing input file name! Usage ./extract-PDF-image file.pdf")
    sys.exit(1)

pdf_path = sys.argv[1]
if not os.path.isfile(pdf_path):
    print("%s not found" % pdf_path)
    sys.exit(1)

doc = fitz.open(pdf_path)

# For documentation see
# https://pymupdf.readthedocs.io/en/latest/document/
# Walk every page of the document and write each referenced image to the
# current directory as img<page>-<xref>_<width>x<height>.png.
#
# PyMuPDF renamed its camelCase methods to snake_case; prefer the current
# names and fall back to the legacy ones so the script runs on both old and
# new releases of the library.
list_page_images = getattr(doc, "get_page_images", None) or doc.getPageImageList

for page_number in range(len(doc)):
    for image_info in list_page_images(page_number):
        # Each entry starts with (xref, smask, width, height, ...).
        xref = image_info[0]
        width = image_info[2]
        height = image_info[3]

        pix = fitz.Pixmap(doc, xref)
        if pix.n - pix.alpha >= 4:
            # CMYK: convert to RGB first, since PNG output needs GRAY or RGB.
            # Rebinding the name drops the reference to the CMYK pixmap.
            pix = fitz.Pixmap(fitz.csRGB, pix)

        png_name = "img%s-%s_%sx%s.png" % (page_number, xref, width, height)
        # Newer releases expose Pixmap.save(); older ones only had writePNG().
        write_png = getattr(pix, "save", None) or pix.writePNG
        write_png(png_name)

        pix = None  # free Pixmap resources before handling the next image