Apache PDFBox Extract Images from PDF Document
This tutorial demonstrates how to extract images from a PDF document in Java using Apache PDFBox.
Maven Dependencies
We use Apache Maven to manage our project dependencies. Make sure the following dependency is on the classpath.
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.8</version>
</dependency>
Apache PDFBox Extract Images from PDF Document
This application extracts images from a PDF document. We loop over each page and get its resources. Next, we iterate over each XObject in those resources, keep only the ones that are images, and write each image to a PNG file.
package com.memorynotfound.pdf.pdfbox;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.io.File;
import java.io.IOException;
/**
 * Extracts the image XObjects referenced by the page resources of a PDF document
 * and writes each one to disk as a PNG file.
 *
 * <p>The input is read from {@code /tmp/merged.pdf}; the output files are named
 * {@code extracted-image-N.png} and are placed in {@code OUTPUT_DIR}, where
 * {@code N} starts at 1 and increases across the whole document.
 */
public class ExtractImages {

    /** Path prefix (including the trailing separator) that output file names are appended to. */
    private static final String OUTPUT_DIR = "/tmp/";

    /**
     * Loads the PDF, walks every page, and saves each image XObject found in the
     * page's resources as a PNG file.
     *
     * <p>XObjects that are not {@link PDImageXObject} instances are skipped.
     * An {@link IOException} raised while loading the document or writing an image
     * is reported on {@code System.err} and ends the extraction.
     *
     * @param args command-line arguments; not used
     * @throws Exception if a non-{@link IOException} failure escapes the extraction
     */
    public static void main(String[] args) throws Exception {
        try (final PDDocument document = PDDocument.load(new File("/tmp/merged.pdf"))) {
            PDPageTree pages = document.getPages();

            // The counter is shared by all pages. Restarting it per page would reuse
            // the same file names and let later pages overwrite earlier pages' images.
            int imageIndex = 1;

            for (PDPage page : pages) {
                PDResources pdResources = page.getResources();
                if (pdResources == null) {
                    // No resources dictionary was found for this page, so there is
                    // nothing to extract from it.
                    continue;
                }

                for (COSName name : pdResources.getXObjectNames()) {
                    PDXObject xObject = pdResources.getXObject(name);
                    if (xObject instanceof PDImageXObject) {
                        PDImageXObject image = (PDImageXObject) xObject;
                        String filename = OUTPUT_DIR + "extracted-image-" + imageIndex + ".png";
                        ImageIO.write(image.getImage(), "png", new File(filename));
                        imageIndex++;
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Exception while trying to create pdf document - " + e);
        }
    }
}