From b44bfff58de3adc572fd4643ecb3ec9b5586bea3 Mon Sep 17 00:00:00 2001 From: 许鹏程 <1821349743@qq.com> Date: 星期二, 28 五月 2024 16:09:03 +0800 Subject: [PATCH] commit --- src/main/java/com/product/lucene/util/FileUtils.java | 253 ++++++++++++++++++++++++++++++++----------------- 1 files changed, 164 insertions(+), 89 deletions(-) diff --git a/src/main/java/com/product/lucene/util/FileUtils.java b/src/main/java/com/product/lucene/util/FileUtils.java index be965f3..c7fd7ca 100644 --- a/src/main/java/com/product/lucene/util/FileUtils.java +++ b/src/main/java/com/product/lucene/util/FileUtils.java @@ -8,12 +8,14 @@ import java.io.InputStream; import java.util.List; +import com.product.common.lang.StringUtils; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.usermodel.Range; +import org.apache.poi.ss.usermodel.CellType; import org.apache.poi.xssf.usermodel.XSSFCell; import org.apache.poi.xssf.usermodel.XSSFRow; import org.apache.poi.xssf.usermodel.XSSFSheet; @@ -28,83 +30,139 @@ public class FileUtils { /** - * 鏂囦欢杞瑂tring + * 鏂囦欢杞瑂tring + * * @param file * @return */ public static String FileToString(File file) { - String file_content=""; - if ((file != null) && (file.isFile())) { - if (file.getName().toLowerCase().endsWith(".txt")) { - file_content = txtToString(file); - } else if ((file.getName().toLowerCase().endsWith(".doc"))){ - file_content = docToString(file); - } else if ((file.getName().toLowerCase().endsWith(".docx"))) { - file_content = docxToString(file); - } else if (file.getName().toLowerCase().endsWith(".xls")) { - file_content = readXls(file); - } else if (file.getName().toLowerCase().endsWith(".xlsx")) { - file_content = readXlsx(file); - } + String file_content = ""; + String fileType = "unknown"; + //鑾峰彇鏂囦欢鍚庣紑 淇濈暀.鍙� + String fileSuffix = file.getName().substring(file.getName().lastIndexOf(".")); + if (StringUtils.equalsAny(fileSuffix, ".doc", ".docx", ".xls", ".xlsx")) { + //璇诲彇鏂囦欢 + fileType = checkDocType(file); } - return file_content; - } - + try { + if ((file != null) && (file.isFile())) { + + + if ("doc".equals(fileType)) { + file_content = docToString(file); + } else if ("docx".equals(fileType)) { + file_content = docxToString(file); + } else if ("xls".equals(fileType)) { + file_content = readXls(file); + } else if ("xlsx".equals(fileType)) { + file_content = readXlsx(file); + } else if (file.getName().toLowerCase().endsWith(".txt")) { + file_content = txtToString(file); + } + } + } catch (Exception e) { + e.printStackTrace(); + } + return file_content; + } + + + public static String checkDocType(File file) { + try (FileInputStream fis = new FileInputStream(file)) { + byte[] bytes = new byte[8]; + fis.read(bytes, 0, 8); + + String hex = bytesToHex(bytes); + + if (hex.contains("504B0304") && (file.getName().endsWith(".doc") || file.getName().endsWith(".docx"))) { + return "docx"; + } else if (hex.contains("D0CF11E0") && (file.getName().endsWith(".doc") || file.getName().endsWith(".docx"))) { + //鍥犱负doc鏂囦欢鐨勫ご閮ㄤ篃鏄疍0CF11E0锛屾墍浠ラ渶瑕佸垽鏂枃浠跺悗缂� + return "doc"; + } + //澧炲姞xls 鍜� xlsx鐨勫垽鏂� + else if (hex.contains("504B0304") && (file.getName().endsWith(".xls") || file.getName().endsWith(".xlsx"))) { + return "xlsx"; + } else if (hex.contains("D0CF11E0") && (file.getName().endsWith(".xls") || file.getName().endsWith(".xlsx"))) { + //鍥犱负xls鏂囦欢鐨勫ご閮ㄤ篃鏄疍0CF11E0锛屾墍浠ラ渶瑕佸垽鏂枃浠跺悗缂� + return "xls"; + } else { + return "unknown"; + } + } catch ( + IOException e) { + e.printStackTrace(); + return "unknown"; + } + } + + private static String bytesToHex(byte[] bytes) { + StringBuilder hex = new StringBuilder(); + for (byte b : bytes) { + hex.append(String.format("%02X", b)); + } + return hex.toString(); + } + /** - * txt鏂囦欢璇诲彇 + * txt鏂囦欢璇诲彇 + * * @param file * @return */ - public static String txtToString(File file) { + public static String txtToString(File file) throws IOException { String result = ""; - try (BufferedReader br = new BufferedReader(new FileReader(file))){// 鏋勯�犱竴涓狟ufferedReader绫绘潵璇诲彇鏂囦欢 + try (BufferedReader br = new BufferedReader(new FileReader(file))) {// 鏋勯�犱竴涓狟ufferedReader绫绘潵璇诲彇鏂囦欢 String s = null; // 浣跨敤readLine鏂规硶锛屼竴娆¤涓�琛� while ((s = br.readLine()) != null) { result = result + "\n" + s; } } catch (Exception e) { - e.printStackTrace(); + throw e; } return result; } - + /** - * doc杞枃鏈� + * doc杞枃鏈� + * * @param file * @return */ - public static String docToString(File file) { + public static String docToString(File file) throws IOException { String result = ""; try ( - FileInputStream fileInputStream = new FileInputStream(file); - HWPFDocument doc = new HWPFDocument(fileInputStream); - ){ + FileInputStream fileInputStream = new FileInputStream(file); + HWPFDocument doc = new HWPFDocument(fileInputStream); + ) { Range rang = doc.getRange(); result += rang.text(); } catch (Exception e) { - e.printStackTrace(); + throw e; } return result; } - + /** - * docx杞枃鏈� + * docx杞枃鏈� + * * @param file * @return * @throws IOException */ - public static String docxToString(File file) { - StringBuffer s=new StringBuffer(); + public static String docxToString(File file) throws IOException { + + StringBuffer s = new StringBuffer(); try ( - InputStream inputStream = new FileInputStream(file); - XWPFDocument doc = new XWPFDocument(inputStream); - ){ + InputStream inputStream = new FileInputStream(file); + XWPFDocument doc = new XWPFDocument(inputStream); + ) { List<XWPFParagraph> paras = doc.getParagraphs(); for (XWPFParagraph para : paras) { // 褰撳墠娈佃惤鐨勫睘鎬� // CTPPr pr = para.getCTP().getPPr(); - s.append( para.getText()); + s.append(para.getText()); } // 鑾峰彇鏂囨。涓墍鏈夌殑琛ㄦ牸 List<XWPFTable> tables = doc.getTables(); @@ -119,29 +177,29 @@ // 鑾峰彇琛屽搴旂殑鍗曞厓鏍� cells = row.getTableCells(); for (XWPFTableCell cell : cells) { - s.append( cell.getText()); + s.append(cell.getText()); } } } } catch (Exception e) { - e.printStackTrace(); + throw e; } return s.toString(); } - + /** * `xls杞瓧绗� - * @param f + * + * @param file * @return * @throws IOException */ - public static String readXls(File file) { + public static String readXls(File file) throws IOException { StringBuffer info = new StringBuffer(); - try( - InputStream inputStream = new FileInputStream(file); - HSSFWorkbook hssfWorkbook = new HSSFWorkbook(inputStream); - ) - { + try ( + InputStream inputStream = new FileInputStream(file); + HSSFWorkbook hssfWorkbook = new HSSFWorkbook(inputStream); + ) { // Read the Sheet for (int numSheet = 0; numSheet < hssfWorkbook.getNumberOfSheets(); numSheet++) { HSSFSheet hssfSheet = hssfWorkbook.getSheetAt(numSheet); @@ -161,88 +219,105 @@ } } } catch (Exception e) { - e.printStackTrace(); + throw e; } return info.toString(); } - + /** - * xlsx杞瓧绗� + * xlsx杞瓧绗� + * * @param file * @return * @throws IOException */ - public static String readXlsx(File file) { + public static String readXlsx(File file) throws IOException { StringBuffer info = new StringBuffer(); - try( + try ( InputStream is = new FileInputStream(file); XSSFWorkbook xssfWorkbook = new XSSFWorkbook(is); - ){ - - for (int numSheet = 0; numSheet < xssfWorkbook.getNumberOfSheets(); numSheet++) { - XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(numSheet); - if (xssfSheet == null) { - continue; - } - // Read the Row - for (int rowNum = 0; rowNum <= xssfSheet.getLastRowNum(); rowNum++) { - XSSFRow xssfRow = xssfSheet.getRow(rowNum); - if (xssfRow != null) { - int tdLength = xssfRow.getLastCellNum(); - for (int j = 0; j <= tdLength; j++) { - XSSFCell no = xssfRow.getCell(j); - info.append(getValue(no)).append(" "); + ) { + + for (int numSheet = 0; numSheet < xssfWorkbook.getNumberOfSheets(); numSheet++) { + XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(numSheet); + if (xssfSheet == null) { + continue; + } + // Read the Row + for (int rowNum = 0; rowNum <= xssfSheet.getLastRowNum(); rowNum++) { + XSSFRow xssfRow = xssfSheet.getRow(rowNum); + if (xssfRow != null) { + int tdLength = xssfRow.getLastCellNum(); + for (int j = 0; j <= tdLength; j++) { + XSSFCell no = xssfRow.getCell(j); + info.append(getValue(no)).append(" "); + } } } } - } } catch (Exception e) { - e.printStackTrace(); + throw e; } return info.toString(); } - + /** - * 鑾峰彇xls鍗曞厓鏍煎�� + * 鑾峰彇xls鍗曞厓鏍煎�� + * * @param hssfCell * @return */ private static String getValue(HSSFCell hssfCell) { - if (hssfCell == null) + if (hssfCell == null) { return ""; - if (hssfCell.getCellType() == hssfCell.CELL_TYPE_BOOLEAN) { - return String.valueOf(hssfCell.getBooleanCellValue()); - } else if (hssfCell.getCellType() == hssfCell.CELL_TYPE_NUMERIC) { - return String.valueOf(hssfCell.getNumericCellValue()); - } else { - hssfCell.setCellType(hssfCell.CELL_TYPE_STRING); - return String.valueOf(hssfCell.getStringCellValue()); + } + CellType cellType = hssfCell.getCellType(); + //濡傛灉涓嶆槸瀛楃绫诲瀷鍒欒繑鍥炵┖ + if (!CellType.BOOLEAN.equals(cellType) && !CellType.NUMERIC.equals(cellType) && !CellType.STRING.equals(cellType)) { + return ""; + } + try { + if (CellType.BOOLEAN.equals(cellType)) { + return String.valueOf(hssfCell.getBooleanCellValue()); + } else if (CellType.NUMERIC.equals(cellType)) { + return String.valueOf(hssfCell.getNumericCellValue()); + } else { + return String.valueOf(hssfCell.getStringCellValue()); + } + } catch (Exception e) { + throw e; } } - + /** - * 鑾峰彇xlxs鍗曞厓鏍煎�� + * 鑾峰彇xlxs鍗曞厓鏍煎�� + * * @param xssfRow * @return */ private static String getValue(XSSFCell xssfRow) { if (xssfRow == null) return ""; - if (xssfRow.getCellType() == xssfRow.CELL_TYPE_BOOLEAN) { + CellType cellType = xssfRow.getCellType(); + //濡傛灉涓嶆槸瀛楃绫诲瀷鍒欒繑鍥炵┖ + if (!CellType.BOOLEAN.equals(cellType) && !CellType.NUMERIC.equals(cellType) && !CellType.STRING.equals(cellType)) { + return ""; + } + if (CellType.BOOLEAN.equals(cellType)) { return String.valueOf(xssfRow.getBooleanCellValue()); - } else if (xssfRow.getCellType() == xssfRow.CELL_TYPE_NUMERIC) { + } else if (CellType.NUMERIC.equals(cellType)) { return String.valueOf(xssfRow.getNumericCellValue()); } else { return String.valueOf(xssfRow.getStringCellValue()); } } - - public static void main(String[] args) { - File f1=new File("D://ceshi/a.txt"); - File f2=new File("D://ceshi/b.doc"); - File f3=new File("D://ceshi/c.docx"); - File f4=new File("D://ceshi/d.xls"); - File f5=new File("D://ceshi/e.xlsx"); + + public static void main(String[] args) throws Exception { + File f1 = new File("D://ceshi/a.txt"); + File f2 = new File("D://ceshi/b.doc"); + File f3 = new File("D://ceshi/c.docx"); + File f4 = new File("D://ceshi/d.xls"); + File f5 = new File("D://ceshi/e.xlsx"); System.out.println(txtToString(f1)); System.out.println("+++++++++++++++++++++++++++++++++++++++++++"); System.out.println(docToString(f2)); -- Gitblit v1.9.2