package com.product.lucene.util; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStream; import java.util.List; import com.product.common.lang.StringUtils; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.usermodel.Range; import org.apache.poi.ss.usermodel.CellType; import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; import org.apache.poi.xwpf.usermodel.XWPFDocument; import org.apache.poi.xwpf.usermodel.XWPFParagraph; import org.apache.poi.xwpf.usermodel.XWPFTable; import org.apache.poi.xwpf.usermodel.XWPFTableCell; import org.apache.poi.xwpf.usermodel.XWPFTableRow; public class FileUtils { /** * 文件转string * * @param file * @return */ public static String FileToString(File file) { String file_content = ""; String fileType = "unknown"; //获取文件后缀 保留.号 String fileSuffix = file.getName().substring(file.getName().lastIndexOf(".")); if (StringUtils.equalsAny(fileSuffix, ".doc", ".docx", ".xls", ".xlsx")) { //读取文件 fileType = checkDocType(file); } try { if ((file != null) && (file.isFile())) { if ("doc".equals(fileType)) { file_content = docToString(file); } else if ("docx".equals(fileType)) { file_content = docxToString(file); } else if ("xls".equals(fileType)) { file_content = readXls(file); } else if ("xlsx".equals(fileType)) { file_content = readXlsx(file); } else if (file.getName().toLowerCase().endsWith(".txt")) { file_content = txtToString(file); } } } catch (Exception e) { e.printStackTrace(); } return file_content; } public static String checkDocType(File file) { try (FileInputStream fis = new FileInputStream(file)) { byte[] bytes = new byte[8]; fis.read(bytes, 0, 8); String hex = bytesToHex(bytes); if (hex.contains("504B0304") && (file.getName().endsWith(".doc") || file.getName().endsWith(".docx"))) { return "docx"; } else if (hex.contains("D0CF11E0") && (file.getName().endsWith(".doc") || file.getName().endsWith(".docx"))) { //因为doc文件的头部也是D0CF11E0,所以需要判断文件后缀 return "doc"; } //增加xls 和 xlsx的判断 else if (hex.contains("504B0304") && (file.getName().endsWith(".xls") || file.getName().endsWith(".xlsx"))) { return "xlsx"; } else if (hex.contains("D0CF11E0") && (file.getName().endsWith(".xls") || file.getName().endsWith(".xlsx"))) { //因为xls文件的头部也是D0CF11E0,所以需要判断文件后缀 return "xls"; } else { return "unknown"; } } catch ( IOException e) { e.printStackTrace(); return "unknown"; } } private static String bytesToHex(byte[] bytes) { StringBuilder hex = new StringBuilder(); for (byte b : bytes) { hex.append(String.format("%02X", b)); } return hex.toString(); } /** * txt文件读取 * * @param file * @return */ public static String txtToString(File file) throws IOException { String result = ""; try (BufferedReader br = new BufferedReader(new FileReader(file))) {// 构造一个BufferedReader类来读取文件 String s = null; // 使用readLine方法,一次读一行 while ((s = br.readLine()) != null) { result = result + "\n" + s; } } catch (Exception e) { throw e; } return result; } /** * doc转文本 * * @param file * @return */ public static String docToString(File file) throws IOException { String result = ""; try ( FileInputStream fileInputStream = new FileInputStream(file); HWPFDocument doc = new HWPFDocument(fileInputStream); ) { Range rang = doc.getRange(); result += rang.text(); } catch (Exception e) { throw e; } return result; } /** * docx转文本 * * @param file * @return * @throws IOException */ public static String docxToString(File file) throws IOException { StringBuffer s = new StringBuffer(); try ( InputStream inputStream = new FileInputStream(file); XWPFDocument doc = new XWPFDocument(inputStream); ) { List paras = doc.getParagraphs(); for (XWPFParagraph para : paras) { // 当前段落的属性 // CTPPr pr = para.getCTP().getPPr(); s.append(para.getText()); } // 获取文档中所有的表格 List tables = doc.getTables(); List rows; List cells; for (XWPFTable table : tables) { // 表格属性 // CTTblPr pr = table.getCTTbl().getTblPr(); // 获取表格对应的行 rows = table.getRows(); for (XWPFTableRow row : rows) { // 获取行对应的单元格 cells = row.getTableCells(); for (XWPFTableCell cell : cells) { s.append(cell.getText()); } } } } catch (Exception e) { throw e; } return s.toString(); } /** * `xls转字符 * * @param file * @return * @throws IOException */ public static String readXls(File file) throws IOException { StringBuffer info = new StringBuffer(); try ( InputStream inputStream = new FileInputStream(file); HSSFWorkbook hssfWorkbook = new HSSFWorkbook(inputStream); ) { // Read the Sheet for (int numSheet = 0; numSheet < hssfWorkbook.getNumberOfSheets(); numSheet++) { HSSFSheet hssfSheet = hssfWorkbook.getSheetAt(numSheet); if (hssfSheet == null) { continue; } // Read the Row for (int rowNum = 0; rowNum <= hssfSheet.getLastRowNum(); rowNum++) { HSSFRow hssfRow = hssfSheet.getRow(rowNum); if (hssfRow != null) { int cols = hssfRow.getLastCellNum(); for (int j = 0; j <= cols; j++) { HSSFCell no = hssfRow.getCell(j); info.append(getValue(no)).append(" "); } } } } } catch (Exception e) { throw e; } return info.toString(); } /** * xlsx转字符 * * @param file * @return * @throws IOException */ public static String readXlsx(File file) throws IOException { StringBuffer info = new StringBuffer(); try ( InputStream is = new FileInputStream(file); XSSFWorkbook xssfWorkbook = new XSSFWorkbook(is); ) { for (int numSheet = 0; numSheet < xssfWorkbook.getNumberOfSheets(); numSheet++) { XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(numSheet); if (xssfSheet == null) { continue; } // Read the Row for (int rowNum = 0; rowNum <= xssfSheet.getLastRowNum(); rowNum++) { XSSFRow xssfRow = xssfSheet.getRow(rowNum); if (xssfRow != null) { int tdLength = xssfRow.getLastCellNum(); for (int j = 0; j <= tdLength; j++) { XSSFCell no = xssfRow.getCell(j); info.append(getValue(no)).append(" "); } } } } } catch (Exception e) { throw e; } return info.toString(); } /** * 获取xls单元格值 * * @param hssfCell * @return */ private static String getValue(HSSFCell hssfCell) { if (hssfCell == null) { return ""; } CellType cellType = hssfCell.getCellType(); //如果不是字符类型则返回空 if (!CellType.BOOLEAN.equals(cellType) && !CellType.NUMERIC.equals(cellType) && !CellType.STRING.equals(cellType)) { return ""; } try { if (CellType.BOOLEAN.equals(cellType)) { return String.valueOf(hssfCell.getBooleanCellValue()); } else if (CellType.NUMERIC.equals(cellType)) { return String.valueOf(hssfCell.getNumericCellValue()); } else { return String.valueOf(hssfCell.getStringCellValue()); } } catch (Exception e) { throw e; } } /** * 获取xlxs单元格值 * * @param xssfRow * @return */ private static String getValue(XSSFCell xssfRow) { if (xssfRow == null) return ""; CellType cellType = xssfRow.getCellType(); //如果不是字符类型则返回空 if (!CellType.BOOLEAN.equals(cellType) && !CellType.NUMERIC.equals(cellType) && !CellType.STRING.equals(cellType)) { return ""; } if (CellType.BOOLEAN.equals(cellType)) { return String.valueOf(xssfRow.getBooleanCellValue()); } else if (CellType.NUMERIC.equals(cellType)) { return String.valueOf(xssfRow.getNumericCellValue()); } else { return String.valueOf(xssfRow.getStringCellValue()); } } public static void main(String[] args) throws Exception { File f1 = new File("D://ceshi/a.txt"); File f2 = new File("D://ceshi/b.doc"); File f3 = new File("D://ceshi/c.docx"); File f4 = new File("D://ceshi/d.xls"); File f5 = new File("D://ceshi/e.xlsx"); System.out.println(txtToString(f1)); System.out.println("+++++++++++++++++++++++++++++++++++++++++++"); System.out.println(docToString(f2)); System.out.println("+++++++++++++++++++++++++++++++++++++++++++"); System.out.println(docxToString(f3)); System.out.println("+++++++++++++++++++++++++++++++++++++++++++"); System.out.println(readXls(f4)); System.out.println("+++++++++++++++++++++++++++++++++++++++++++"); System.out.println(readXlsx(f5)); } }