Java：使用 Apache PDFBox 读取 PDF 内容

本文使用 Apache PDFBox 解析 PDF 文档。PDFBox 目前已经发展到 3.0 版本，API 与 2.x 有很大的区别，迁移说明请参阅：https://pdfbox.apache.org/3.0/migration.html

引入依赖：

pom.xml

<!-- Apache PDFBox library; this example pins version 3.0.4 (3.x line). -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.4</version>
</dependency>

编码：

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBufferedFile;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
 * Loads a PDF file with PDFBox, extracts its text via {@link PDFTextStripper},
 * and prints the extracted text to standard output.
 *
 * @throws IOException propagated from loading the document or extracting its text
 */
@Test
public void pdf_content_parser_test() throws IOException {
    File pdfFile = new File("/path/to/xxx.pdf");
    // try-with-resources closes the document when the block exits.
    try (PDDocument pdf = Loader.loadPDF(new RandomAccessReadBufferedFile(pdfFile))) {
        String content = new PDFTextStripper().getText(pdf);
        System.out.println(content);
    }
}