第一步引入依赖
<!-- Apache POI core: legacy Excel (.xls) -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>3.17</version>
</dependency>
<!-- Apache POI OOXML: Excel 2007+ (.xlsx) -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>3.17</version>
</dependency>
<!-- Apache POI scratchpad: Word documents -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.17</version>
</dependency>
<!-- Apache PDFBox: PDF text extraction -->
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.18</version>
</dependency>
<!-- Spring Boot starter for Elasticsearch; no version is given here,
     presumably it is managed by the Spring Boot parent/BOM (TODO confirm) -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
第二步创建解析文档的工具类ReadFileConverter
packagecom.atguigu.servicees.util; importorg.apache.commons.io.FileUtils; importorg.apache.pdfbox.pdmodel.PDDocument; importorg.apache.pdfbox.text.PDFTextStripper; importorg.apache.poi.POIXMLDocument; importorg.apache.poi.hssf.usermodel.HSSFCell; importorg.apache.poi.hssf.usermodel.HSSFRow; importorg.apache.poi.hssf.usermodel.HSSFSheet; importorg.apache.poi.hssf.usermodel.HSSFWorkbook; importorg.apache.poi.hwpf.extractor.WordExtractor; importorg.apache.poi.ss.usermodel.Cell; importorg.apache.poi.xssf.usermodel.XSSFCell; importorg.apache.poi.xssf.usermodel.XSSFRow; importorg.apache.poi.xssf.usermodel.XSSFSheet; importorg.apache.poi.xssf.usermodel.XSSFWorkbook; importorg.apache.poi.xwpf.extractor.XWPFWordExtractor; importjava.io.BufferedInputStream; importjava.io.File; importjava.io.FileInputStream; importjava.io.IOException; importjava.text.NumberFormat; /*** 文件内容读取转换器 */ public classReadFileConverter { public String getContents(String path) throwsException { String contents = ""; int index = path.lastIndexOf("."); String file_suffix = path.substring(index + 1).toLowerCase(); if (file_suffix.equalsIgnoreCase("txt") || file_suffix.equalsIgnoreCase("log")) { contents = this.readTXT(path); } else if (file_suffix.equalsIgnoreCase("xls")) { contents = this.readXLS(path); } else if (file_suffix.equalsIgnoreCase("xlsx")) { contents = this.readXLSX(path); } else if (file_suffix.equalsIgnoreCase("doc")) { contents = this.readDOC(path); } else if (file_suffix.equalsIgnoreCase("docx")) { contents = this.readDOCX(path); } else if (file_suffix.equalsIgnoreCase("pdf")) { contents = this.readPDF(path); } returncontents; } /*** 解析xls文件内容 * @paramfile * @return* @throwsException */ public String readXLS(String file) throwsException { StringBuilder content = newStringBuilder(); HSSFWorkbook workbook = new HSSFWorkbook(newFileInputStream(file)); try{ for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) { if (null !=workbook.getSheetAt(numSheets)) { HSSFSheet 
aSheet = workbook.getSheetAt(numSheets);//获得一个sheet for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) { if (null !=aSheet.getRow(rowNumOfSheet)) { HSSFRow aRow = aSheet.getRow(rowNumOfSheet); //获得一个行 for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) { if (null !=aRow.getCell(cellNumOfRow)) { HSSFCell aCell = aRow.getCell(cellNumOfRow);//获得列值 if (this.convertCell(aCell).length() > 0) { content.append(this.convertCell(aCell)); } } content.append(" "); } } } } } } catch(Exception e) { content.append("xls文件格式不对或损坏"); } finally{ if (workbook != null) { workbook.close(); } } returncontent.toString(); } /*** 解析xlsx文件内容 * @paramfile * @return* @throwsException */ public String readXLSX(String file) throwsException { StringBuilder content = newStringBuilder(); XSSFWorkbook workbook = newXSSFWorkbook(file); try{ for (int numSheets = 0; numSheets < workbook.getNumberOfSheets(); numSheets++) { if (null !=workbook.getSheetAt(numSheets)) { XSSFSheet aSheet = workbook.getSheetAt(numSheets);//获得一个sheet for (int rowNumOfSheet = 0; rowNumOfSheet <= aSheet.getLastRowNum(); rowNumOfSheet++) { if (null !=aSheet.getRow(rowNumOfSheet)) { XSSFRow aRow = aSheet.getRow(rowNumOfSheet); //获得一个行 for (short cellNumOfRow = 0; cellNumOfRow <= aRow.getLastCellNum(); cellNumOfRow++) { if (null !=aRow.getCell(cellNumOfRow)) { XSSFCell aCell = aRow.getCell(cellNumOfRow);//获得列值 if (this.convertCell(aCell).length() > 0) { content.append(this.convertCell(aCell)); } } content.append(" "); } } } } } } catch(Exception e) { content.append("xlsx文件格式不对或损坏"); } finally{ if (workbook != null) { workbook.close(); } } returncontent.toString(); } /*** 解析txt文件内容 * @paramfile * @return* @throwsException */ public String readTXT(String file) throwsException { String contents = ""; try{ String encoding = this.get_charset(newFile(file)); if (encoding.equalsIgnoreCase("GBK")) { contents = FileUtils.readFileToString(new File(file), "gbk"); } else{ 
contents = FileUtils.readFileToString(new File(file), "utf8"); } } catch(Exception e) { contents = "txt文件格式不对或损坏"; } returncontents; } /*** 解析doc文件内容 * @paramfile * @return* @throwsException */ public String readDOC(String file) throwsException { String returnStr; FileInputStream inputStream = new FileInputStream(newFile(file)); WordExtractor wordExtractor = newWordExtractor(inputStream); try{ returnStr =wordExtractor.getText(); } catch(Exception e) { returnStr = "doc文件格式不对或损坏"; } finally{ if (inputStream != null) { inputStream.close(); } } returnreturnStr; } /*** 解析docx文件内容 * @paramfile * @return* @throwsException */ public String readDOCX(String file) throwsException { String docx; XWPFWordExtractor xwp = newXWPFWordExtractor(POIXMLDocument.openPackage(file)); try{ docx =xwp.getText(); } catch(Exception e) { docx = "docx文件格式不对或损坏"; } finally{ if (xwp != null) { xwp.close(); } } returndocx; } /*** 解析pdf文件内容 * @paramfile * @return* @throwsException */ public String readPDF(String file) throwsException { String result = null; FileInputStream is = null; PDDocument document = null; try{ is = newFileInputStream(file); document =PDDocument.load(is); PDFTextStripper stripper = newPDFTextStripper(); result =stripper.getText(document); } catch(Exception e) { result = "pdf文件格式不对或损坏"; } finally{ if (is != null) { is.close(); } if (document != null) { document.close(); } } returnresult; } private String get_charset(File file) throwsIOException { String charset = "GBK"; byte[] first3Bytes = new byte[3]; BufferedInputStream bis = null; try{ boolean checked = false; bis = new BufferedInputStream(newFileInputStream(file)); bis.mark(0); int read = bis.read(first3Bytes, 0, 3); if (read == -1) returncharset; if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) { charset = "UTF-16LE"; checked = true; } else if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) { charset = "UTF-16BE"; checked = true; } else if (first3Bytes[0] == (byte) 0xEF && 
first3Bytes[1] == (byte) 0xBB && first3Bytes[2] == (byte) 0xBF) { charset = "UTF-8"; checked = true; } bis.reset(); if (!checked) { int loc = 0; while ((read = bis.read()) != -1) { loc = loc + 1; if (read >= 0xF0) break; if (0x80 <= read && read <= 0xBF) //单独出现BF以下的,也算是GBK break; if (0xC0 <= read && read <= 0xDF) { read =bis.read(); if (0x80 <= read && read <= 0xBF) //双字节 (0xC0 - 0xDF) continue; else break; } else if (0xE0 <= read && read <= 0xEF) {//也有可能出错,但是几率较小 read =bis.read(); if (0x80 <= read && read <= 0xBF) { read =bis.read(); if (0x80 <= read && read <= 0xBF) { charset = "UTF-8"; break; } else break; } else break; } } } } catch(Exception e) { e.printStackTrace(); } finally{ if (bis != null) { bis.close(); } } returncharset; } /*** 解析excel需要使用的工具类 * @paramcell * @return */ privateString convertCell(Cell cell) { NumberFormat formater =NumberFormat.getInstance(); formater.setGroupingUsed(false); String cellValue = ""; if (cell == null) { returncellValue; } switch(cell.getCellTypeEnum()) { caseNUMERIC: cellValue =formater.format(cell.getNumericCellValue()); break; caseSTRING: cellValue =cell.getStringCellValue(); break; caseBLANK: cellValue =cell.getStringCellValue(); break; caseBOOLEAN: cellValue =Boolean.valueOf(cell.getBooleanCellValue()).toString(); break; caseERROR: cellValue =String.valueOf(cell.getErrorCellValue()); break; default: cellValue = ""; } returncellValue.trim(); } }