下载地址:http://pdfbox.apache.org/downloads.html
下载所需jar包如下:
bcprov-jdk16-140.jar
commons-logging-1.1.3.jar
fontbox-1.8.7.jar
pdfbox-1.8.7.jar
代码实现如下:
package com.util;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Command-line utility that walks a folder recursively and, for every PDF file found,
 * writes the extracted text to a {@code .txt} file with the same base name in the same folder.
 *
 * @author www.yoodb.com
 */
public class PdfParser {

    /**
     * Files to convert: key is the path of the {@code .txt} file to write,
     * value is the path of the source PDF.
     */
    private static Map<String, String> fileNames = new HashMap<String, String>();

    /**
     * Root folder that is scanned for PDF files.
     */
    private static String path = "D:\\Test\\";

    /**
     * Scans {@link #path} and converts every PDF found to a text file.
     *
     * @param args unused
     * @throws Exception if a PDF cannot be read or parsed, or a text file cannot be written;
     *                   processing stops at the first failure
     */
    public static void main(String[] args) throws Exception {
        getFile(path);
        for (Entry<String, String> entry : fileNames.entrySet()) {
            convert(entry.getValue(), entry.getKey());
        }
    }

    /**
     * Extracts the text of one PDF and writes it to one text file.
     *
     * @param pdfPath path of the PDF to read
     * @param txtPath path of the text file to create or overwrite
     * @throws IOException if reading, parsing or writing fails
     */
    private static void convert(String pdfPath, String txtPath) throws IOException {
        FileInputStream fis = new FileInputStream(pdfPath);
        try {
            PDFParser parser = new PDFParser(fis);
            parser.parse();
            PDDocument document = parser.getPDDocument();
            try {
                String text = new PDFTextStripper().getText(document);
                // Open the output only after extraction succeeded, so a broken PDF
                // does not leave an empty .txt file behind.
                // NOTE(review): FileWriter uses the platform default charset (unchanged
                // from the original); switch to an explicit charset if output must be portable.
                BufferedWriter writer = new BufferedWriter(new FileWriter(txtPath));
                try {
                    writer.write(text);
                } finally {
                    writer.close();
                }
            } finally {
                document.close();
            }
        } finally {
            fis.close();
        }
    }

    /**
     * Recursively collects the PDF files under {@code path} into {@link #fileNames}.
     * Full paths are stored so that files in sub-folders are read from, and written to,
     * the folder they actually live in.
     *
     * @param path folder to scan; silently skipped if it cannot be listed
     */
    private static void getFile(String path) {
        File[] children = new File(path).listFiles();
        if (children == null) {
            // listFiles() returns null when path is not a directory or cannot be read.
            return;
        }
        for (File child : children) {
            if (child.isFile()) {
                String name = child.getName();
                int dot = name.lastIndexOf('.');
                // dot < 0 means the file has no extension at all.
                if (dot >= 0 && ".pdf".equalsIgnoreCase(name.substring(dot))) {
                    File txt = new File(child.getParentFile(), name.substring(0, dot) + ".txt");
                    fileNames.put(txt.getPath(), child.getPath());
                }
            } else if (child.isDirectory()) {
                getFile(child.getPath());
            }
        }
    }
}
如果报如下异常错误:
Exception in thread "main" java.lang.NoClassDefFoundError: org/bouncycastle/jce/provider/BouncyCastleProvider
    at org.apache.pdfbox.pdmodel.PDDocument.openProtection(PDDocument.java:1594)
    at org.apache.pdfbox.pdmodel.PDDocument.decrypt(PDDocument.java:942)
    at org.apache.pdfbox.util.PDFTextStripper.writeText(PDFTextStripper.java:337)
    at org.apache.pdfbox.util.PDFTextStripper.getText(PDFTextStripper.java:257)
    at com.hkm.TankWar.pdf2.getText(pdf2.java:18)
    at com.hkm.TankWar.pdf2.main(pdf2.java:67)
Caused by: java.lang.ClassNotFoundException: org.bouncycastle.jce.provider.BouncyCastleProvider
    at java.net.URLClassLoader$1.run(URLClassLoader.java:366)
    at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
    at java.security.AccessController.doPrivileged(Native Method)
    at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
    at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:308)
    at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
    ... 6 more
解决方案:
BouncyCastle可以从www.bouncycastle.org下载(对应JDK版本的BouncyCastle)
1、将下载的bcprov-jdk16-140.jar包放在F:\tools\Java\jdk1.7.0_51\jre\lib\ext目录下;
2、打开F:\tools\Java\jdk1.7.0_51\jre\lib\security目录下的java.security文件,在“# List of providers and their preference orders (see above):”下面已有的security.provider列表末尾添加如下一行(其中x为现有最大序号加1):
security.provider.x=org.bouncycastle.jce.provider.BouncyCastleProvider
(以上是网上流传的解决方法,其中不进行第2步配置也可以解决该异常。我个人的做法是直接把该jar当成普通依赖包加入项目使用,同样解决了异常,不需要其他配置。)