poi解析office文档内容的工具类,poi处理word文档

摘要：

第一步：引入依赖（org.apache.poi 的 poi、poi-ooxml、poi-scratchpad 3.17，org.apache.pdfbox 的 pdfbox 2.0.18，以及 org.springframework.boot 的 spring-boot-starter-data-elasticsearch）。第二步：在包 com.atguigu.servicees.util 下创建解析文档的工具类 ReadFileConverter。该类的 getContents 方法根据文件后缀分别调用 readTXT、readXLS、readXLSX、readDOC、readDOCX、readPDF，读取 txt/log、xls、xlsx、doc、docx、pdf 文件的文本内容。完整的依赖配置和代码见下文。

第一步引入依赖

       <!--xls-->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.17</version>
        </dependency>

        <!--xlsx-->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>

        <!--word-->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-scratchpad</artifactId>
           <version>3.17</version>
        </dependency>

        <!--pdf-->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.18</version>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>

第二步创建解析文档的工具类ReadFileConverter

package com.atguigu.servicees.util;

import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.NumberFormat;

/*** 文件内容读取转换器
 */
public classReadFileConverter {

    public String getContents(String path) throwsException {
        String contents = "";
        int index = path.lastIndexOf(".");
        String file_suffix = path.substring(index + 1).toLowerCase();
        if (file_suffix.equalsIgnoreCase("txt") || file_suffix.equalsIgnoreCase("log")) {
            contents = this.readTXT(path);
        } else if (file_suffix.equalsIgnoreCase("xls")) {
            contents = this.readXLS(path);
        } else if (file_suffix.equalsIgnoreCase("xlsx")) {
            contents = this.readXLSX(path);
        } else if (file_suffix.equalsIgnoreCase("doc")) {
            contents = this.readDOC(path);
        } else if (file_suffix.equalsIgnoreCase("docx")) {
            contents = this.readDOCX(path);
        } else if (file_suffix.equalsIgnoreCase("pdf")) {
            contents = this.readPDF(path);
        }
        returncontents;
    }


    /*** 解析xls文件内容
     * @paramfile
     * @return* @throwsException
     */
    public String readXLS(String file) throwsException {
        StringBuilder content = newStringBuilder();
        HSSFWorkbook workbook = new HSSFWorkbook(newFileInputStream(file));
        try{
            for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                if (null !=workbook.getSheetAt(numSheets)) {
                    HSSFSheet aSheet = workbook.getSheetAt(numSheets);//获得一个sheet
                    for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                        if (null !=aSheet.getRow(rowNumOfSheet)) {
                            HSSFRow aRow = aSheet.getRow(rowNumOfSheet); //获得一个行
                            for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {
                                if (null !=aRow.getCell(cellNumOfRow)) {
                                    HSSFCell aCell = aRow.getCell(cellNumOfRow);//获得列值
                                    if (this.convertCell(aCell).length() > 0) {
                                        content.append(this.convertCell(aCell));
                                    }
                                }
                                content.append("
");
                            }
                        }
                    }
                }
            }
        } catch(Exception e) {
            content.append("xls文件格式不对或损坏");
        } finally{
            if (workbook != null) {
                workbook.close();
            }
        }
        returncontent.toString();
    }


    /*** 解析xlsx文件内容
     * @paramfile
     * @return* @throwsException
     */
    public String readXLSX(String file) throwsException {
        StringBuilder content = newStringBuilder();
        XSSFWorkbook workbook = newXSSFWorkbook(file);
        try{
            for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) {
                if (null !=workbook.getSheetAt(numSheets)) {
                    XSSFSheet aSheet = workbook.getSheetAt(numSheets);//获得一个sheet
                    for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) {
                        if (null !=aSheet.getRow(rowNumOfSheet)) {
                            XSSFRow aRow = aSheet.getRow(rowNumOfSheet); //获得一个行
                            for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) {
                                if (null !=aRow.getCell(cellNumOfRow)) {
                                    XSSFCell aCell = aRow.getCell(cellNumOfRow);//获得列值
                                    if (this.convertCell(aCell).length() > 0) {
                                        content.append(this.convertCell(aCell));
                                    }
                                }
                                content.append("
");
                            }
                        }
                    }
                }
            }
        } catch(Exception e) {
            content.append("xlsx文件格式不对或损坏");
        } finally{
            if (workbook != null) {
                workbook.close();
            }
        }
        returncontent.toString();
    }


    /*** 解析txt文件内容
     * @paramfile
     * @return* @throwsException
     */
    public String readTXT(String file) throwsException {
        String contents = "";
        try{
            String encoding = this.get_charset(newFile(file));
            if (encoding.equalsIgnoreCase("GBK")) {
                contents = FileUtils.readFileToString(new File(file), "gbk");
            } else{
                contents = FileUtils.readFileToString(new File(file), "utf8");
            }
        } catch(Exception e) {
            contents = "txt文件格式不对或损坏";
        }
        returncontents;
    }

    /*** 解析doc文件内容
     * @paramfile
     * @return* @throwsException
     */
    public String readDOC(String file) throwsException {
        String returnStr;
        FileInputStream inputStream = new FileInputStream(newFile(file));
        WordExtractor wordExtractor = newWordExtractor(inputStream);
        try{
            returnStr =wordExtractor.getText();
        } catch(Exception e) {
            returnStr = "doc文件格式不对或损坏";
        } finally{
            if (inputStream != null) {
                inputStream.close();
            }
        }
        returnreturnStr;
    }


    /*** 解析docx文件内容
     * @paramfile
     * @return* @throwsException
     */
    public String readDOCX(String file) throwsException {
        String docx;
        XWPFWordExtractor xwp = newXWPFWordExtractor(POIXMLDocument.openPackage(file));
        try{
            docx =xwp.getText();
        } catch(Exception e) {
            docx = "docx文件格式不对或损坏";
        } finally{
            if (xwp != null) {
                xwp.close();
            }
        }
        returndocx;
    }


    /*** 解析pdf文件内容
     * @paramfile
     * @return* @throwsException
     */
    public String readPDF(String file) throwsException {
        String result = null;
        FileInputStream is = null;
        PDDocument document = null;
        try{
            is = newFileInputStream(file);
            document =PDDocument.load(is);
            PDFTextStripper stripper = newPDFTextStripper();
            result =stripper.getText(document);
        } catch(Exception e) {
            result = "pdf文件格式不对或损坏";
        } finally{
            if (is != null) {
                is.close();
            }
            if (document != null) {
                document.close();
            }
        }
        returnresult;
    }

    private String get_charset(File file) throwsIOException {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        BufferedInputStream bis = null;
        try{
            boolean checked = false;
            bis = new BufferedInputStream(newFileInputStream(file));
            bis.mark(0);
            int read = bis.read(first3Bytes, 0, 3);
            if (read == -1)
                returncharset;
            if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
                charset = "UTF-16LE";
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
                charset = "UTF-16BE";
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xEF && first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) {
                charset = "UTF-8";
                checked = true;
            }
            bis.reset();
            if (!checked) {
                int loc = 0;
                while ((read = bis.read()) != -1) {
                    loc = loc + 1;
                    if (read >= 0xF0)
                        break;
                    if (0x80 <= read && read <= 0xBF) //单独出现BF以下的，也算是GBK
                        break;
                    if (0xC0 <= read && read <= 0xDF) {
                        read =bis.read();
                        if (0x80 <= read && read <= 0xBF) //双字节 (0xC0 - 0xDF)
                            continue;
                        else
                            break;
                    } else if (0xE0 <= read && read <= 0xEF) {//也有可能出错，但是几率较小
                        read =bis.read();
                        if (0x80 <= read && read <= 0xBF) {
                            read =bis.read();
                            if (0x80 <= read && read <= 0xBF) {
                                charset = "UTF-8";
                                break;
                            } else
                                break;
                        } else
                            break;
                    }
                }
            }
        } catch(Exception e) {
            e.printStackTrace();
        } finally{
            if (bis != null) {
                bis.close();
            }
        }
        returncharset;
    }

    /*** 解析excel需要使用的工具类
     * @paramcell
     * @return
     */
    privateString convertCell(Cell cell) {
        NumberFormat formater =NumberFormat.getInstance();
        formater.setGroupingUsed(false);
        String cellValue = "";
        if (cell == null) {
            returncellValue;
        }
        switch(cell.getCellTypeEnum()) {
            caseNUMERIC:
                cellValue =formater.format(cell.getNumericCellValue());
                break;
            caseSTRING:
                cellValue =cell.getStringCellValue();
                break;
            caseBLANK:
                cellValue =cell.getStringCellValue();
                break;
            caseBOOLEAN:
                cellValue =Boolean.valueOf(cell.getBooleanCellValue()).toString();
                break;
            caseERROR:
                cellValue =String.valueOf(cell.getErrorCellValue());
                break;
            default:
                cellValue = "";
        }
        returncellValue.trim();
    }

}

poi解析office文档内容的工具类

相关文章

微信支付（公众号支付JSAPI）--转载

Prism完成的一个WPF项目

DataTable导出到Excel

ASP.NET Core 中间件（Middleware)（一）

Quartz.Net系列（五）：Quartz五大构件Job之JobBuilder解析

delphi类型转换 asci与char

最新文章

随机推荐

思享工具箱导航

JSON工具

格式化转换

加解密编码

文本数字

网络

站长

计算

其他

对照列表