From b44bfff58de3adc572fd4643ecb3ec9b5586bea3 Mon Sep 17 00:00:00 2001
From: 许鹏程 <1821349743@qq.com>
Date: 星期二, 28 五月 2024 16:09:03 +0800
Subject: [PATCH] commit

---
 src/main/java/com/product/lucene/util/FileUtils.java |  253 ++++++++++++++++++++++++++++++++-----------------
 1 files changed, 164 insertions(+), 89 deletions(-)

diff --git a/src/main/java/com/product/lucene/util/FileUtils.java b/src/main/java/com/product/lucene/util/FileUtils.java
index be965f3..c7fd7ca 100644
--- a/src/main/java/com/product/lucene/util/FileUtils.java
+++ b/src/main/java/com/product/lucene/util/FileUtils.java
@@ -8,12 +8,14 @@
 import java.io.InputStream;
 import java.util.List;
 
+import com.product.common.lang.StringUtils;
 import org.apache.poi.hssf.usermodel.HSSFCell;
 import org.apache.poi.hssf.usermodel.HSSFRow;
 import org.apache.poi.hssf.usermodel.HSSFSheet;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.usermodel.Range;
+import org.apache.poi.ss.usermodel.CellType;
 import org.apache.poi.xssf.usermodel.XSSFCell;
 import org.apache.poi.xssf.usermodel.XSSFRow;
 import org.apache.poi.xssf.usermodel.XSSFSheet;
@@ -28,83 +30,139 @@
 public class FileUtils {
 
 	/**
-	 * 	鏂囦欢杞瑂tring
+	 * Converts a file to a string.
+	 *
 	 * @param file
 	 * @return
 	 */
 	public static String FileToString(File file) {
-		String file_content="";
-    	if ((file != null) && (file.isFile())) {
-    		if (file.getName().toLowerCase().endsWith(".txt")) {
-    			file_content = txtToString(file);
-			} else if ((file.getName().toLowerCase().endsWith(".doc"))){
-				file_content = docToString(file);
-			} else if ((file.getName().toLowerCase().endsWith(".docx"))) {
-				file_content = docxToString(file);
-			} else if (file.getName().toLowerCase().endsWith(".xls")) {
-				file_content = readXls(file);
-			} else if (file.getName().toLowerCase().endsWith(".xlsx")) {
-				file_content = readXlsx(file);
-			}
+		String file_content = "";
+		// Guard before touching the name: the null check used to run after file.getName().
+		if (file == null || !file.isFile()) {
+			return file_content;
 		}
-    	return file_content;
-    }
-	
+		// Lower-cased so the suffix tests below ignore case, as the pre-patch code did.
+		String lowerName = file.getName().toLowerCase();
+		int dot = lowerName.lastIndexOf('.');
+		// Suffix including the dot; "" when the name has none (substring(-1) used to throw).
+		String fileSuffix = dot < 0 ? "" : lowerName.substring(dot);
+		String fileType = StringUtils.equalsAny(fileSuffix, ".doc", ".docx", ".xls", ".xlsx")
+				? checkDocType(file) : "unknown";
+		try {
+			if ("doc".equals(fileType)) {
+				file_content = docToString(file);
+			} else if ("docx".equals(fileType)) {
+				file_content = docxToString(file);
+			} else if ("xls".equals(fileType)) {
+				file_content = readXls(file);
+			} else if ("xlsx".equals(fileType)) {
+				file_content = readXlsx(file);
+			} else if (".txt".equals(fileSuffix)) {
+				file_content = txtToString(file);
+			}
+		} catch (Exception e) {
+			e.printStackTrace(); // best effort: an unreadable file yields ""
+		}
+		return file_content;
+	}
+
+
+	/**
+	 * Detects the real Office format of {@code file} from its leading magic bytes; the
+	 * extension, matched ignoring case, only tells Word from Excel.
+	 * @param file file to inspect
+	 * @return "doc", "docx", "xls" or "xlsx"; "unknown" if unreadable or unrecognised
+	 */
+	public static String checkDocType(File file) {
+		String hex;
+		try (FileInputStream fis = new FileInputStream(file)) {
+			byte[] bytes = new byte[8];
+			fis.read(bytes, 0, 8); // a short read leaves zeros, which match no signature
+			hex = bytesToHex(bytes);
+		} catch (IOException e) {
+			e.printStackTrace();
+			return "unknown";
+		}
+		// Signatures sit at offset 0; contains() also matched later or nibble-shifted bytes.
+		boolean zip = hex.startsWith("504B0304");  // ZIP local-file header (docx/xlsx)
+		boolean ole2 = hex.startsWith("D0CF11E0"); // legacy compound file (doc/xls)
+		String name = file.getName().toLowerCase();
+		if ((zip || ole2) && (name.endsWith(".doc") || name.endsWith(".docx"))) {
+			return zip ? "docx" : "doc";
+		} else if ((zip || ole2) && (name.endsWith(".xls") || name.endsWith(".xlsx"))) {
+			return zip ? "xlsx" : "xls";
+		}
+		return "unknown";
+	}
+
+	/** Hex-encodes {@code bytes} in order, two upper-case digits per byte. */
+	private static String bytesToHex(byte[] bytes) {
+		StringBuilder hex = new StringBuilder(bytes.length * 2);
+		for (byte b : bytes) {
+			hex.append(String.format("%02X", b));
+		}
+		return hex.toString();
+	}
+
 	/**
-	 * 	txt鏂囦欢璇诲彇
+	 * Reads a txt file.
+	 *
 	 * @param file
 	 * @return
 	 */
-	public static String txtToString(File file) {
+	public static String txtToString(File file) throws IOException {
 		String result = "";
-		try (BufferedReader br = new BufferedReader(new FileReader(file))){// 鏋勯�犱竴涓狟ufferedReader绫绘潵璇诲彇鏂囦欢
+		try (BufferedReader br = new BufferedReader(new FileReader(file))) {// 鏋勯�犱竴涓狟ufferedReader绫绘潵璇诲彇鏂囦欢
 			String s = null;
 			// 浣跨敤readLine鏂规硶锛屼竴娆¤涓�琛�
 			while ((s = br.readLine()) != null) {
 				result = result + "\n" + s;
 			}
 		} catch (Exception e) {
-			e.printStackTrace();
+			throw e;
 		}
 		return result;
 	}
-	
+
 	/**
-	 * 	doc杞枃鏈�
+	 * Converts a doc file to text.
+	 *
 	 * @param file
 	 * @return
 	 */
-	public static String docToString(File file) {
+	public static String docToString(File file) throws IOException {
 		String result = "";
 		try (
-			FileInputStream fileInputStream = new FileInputStream(file);
-			HWPFDocument doc = new HWPFDocument(fileInputStream);
-		){
+				FileInputStream fileInputStream = new FileInputStream(file);
+				HWPFDocument doc = new HWPFDocument(fileInputStream);
+		) {
 			Range rang = doc.getRange();
 			result += rang.text();
 		} catch (Exception e) {
-			e.printStackTrace();
+			throw e;
 		}
 		return result;
 	}
-	
+
 	/**
-	 * 	docx杞枃鏈�
+	 * Converts a docx file to text.
+	 *
 	 * @param file
 	 * @return
 	 * @throws IOException
 	 */
-	public static String docxToString(File file) {
-		StringBuffer s=new StringBuffer();
+	public static String docxToString(File file) throws IOException {
+
+		StringBuffer s = new StringBuffer();
 		try (
-			InputStream inputStream = new FileInputStream(file);
-			XWPFDocument doc = new XWPFDocument(inputStream);
-		){
+				InputStream inputStream = new FileInputStream(file);
+				XWPFDocument doc = new XWPFDocument(inputStream);
+		) {
 			List<XWPFParagraph> paras = doc.getParagraphs();
 			for (XWPFParagraph para : paras) {
 				// 褰撳墠娈佃惤鐨勫睘鎬�
 				// CTPPr pr = para.getCTP().getPPr();
-				s.append( para.getText());
+				s.append(para.getText());
 			}
 			// 鑾峰彇鏂囨。涓墍鏈夌殑琛ㄦ牸
 			List<XWPFTable> tables = doc.getTables();
@@ -119,29 +177,29 @@
 					// 鑾峰彇琛屽搴旂殑鍗曞厓鏍�
 					cells = row.getTableCells();
 					for (XWPFTableCell cell : cells) {
-						s.append( cell.getText());
+						s.append(cell.getText());
 					}
 				}
 			}
 		} catch (Exception e) {
-			e.printStackTrace();
+			throw e;
 		}
 		return s.toString();
 	}
-	
+
 	/**
 	 * `xls杞瓧绗�
-	 * @param f
+	 *
+	 * @param file
 	 * @return
 	 * @throws IOException
 	 */
-	public static String readXls(File file) {
+	public static String readXls(File file) throws IOException {
 		StringBuffer info = new StringBuffer();
-		try(
-			InputStream inputStream = new FileInputStream(file);
-			HSSFWorkbook hssfWorkbook = new HSSFWorkbook(inputStream);
-		)
-		{
+		try (
+				InputStream inputStream = new FileInputStream(file);
+				HSSFWorkbook hssfWorkbook = new HSSFWorkbook(inputStream);
+		) {
 			// Read the Sheet
 			for (int numSheet = 0; numSheet < hssfWorkbook.getNumberOfSheets(); numSheet++) {
 				HSSFSheet hssfSheet = hssfWorkbook.getSheetAt(numSheet);
@@ -161,88 +219,105 @@
 				}
 			}
 		} catch (Exception e) {
-			e.printStackTrace();
+			throw e;
 		}
 		return info.toString();
 	}
-	
+
 	/**
-	 * 	xlsx杞瓧绗�
+	 * Converts an xlsx file to a string.
+	 *
 	 * @param file
 	 * @return
 	 * @throws IOException
 	 */
-	public static String readXlsx(File file) {
+	public static String readXlsx(File file) throws IOException {
 		StringBuffer info = new StringBuffer();
-		try(
+		try (
 				InputStream is = new FileInputStream(file);
 				XSSFWorkbook xssfWorkbook = new XSSFWorkbook(is);
-		){
-		
-		for (int numSheet = 0; numSheet < xssfWorkbook.getNumberOfSheets(); numSheet++) {
-			XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(numSheet);
-			if (xssfSheet == null) {
-				continue;
-			}
-			// Read the Row
-			for (int rowNum = 0; rowNum <= xssfSheet.getLastRowNum(); rowNum++) {
-				XSSFRow xssfRow = xssfSheet.getRow(rowNum);
-				if (xssfRow != null) {
-					int tdLength = xssfRow.getLastCellNum();
-					for (int j = 0; j <= tdLength; j++) {
-						XSSFCell no = xssfRow.getCell(j);
-						info.append(getValue(no)).append(" ");
+		) {
+
+			for (int numSheet = 0; numSheet < xssfWorkbook.getNumberOfSheets(); numSheet++) {
+				XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(numSheet);
+				if (xssfSheet == null) {
+					continue;
+				}
+				// Read the Row
+				for (int rowNum = 0; rowNum <= xssfSheet.getLastRowNum(); rowNum++) {
+					XSSFRow xssfRow = xssfSheet.getRow(rowNum);
+					if (xssfRow != null) {
+						int tdLength = xssfRow.getLastCellNum();
+						for (int j = 0; j <= tdLength; j++) {
+							XSSFCell no = xssfRow.getCell(j);
+							info.append(getValue(no)).append(" ");
+						}
 					}
 				}
 			}
-		}
 		} catch (Exception e) {
-			e.printStackTrace();
+			throw e;
 		}
 		return info.toString();
 	}
-	
+
 	/**
-	 * 	鑾峰彇xls鍗曞厓鏍煎��
+	 * Gets the value of an xls cell.
+	 *
 	 * @param hssfCell
 	 * @return
 	 */
 	private static String getValue(HSSFCell hssfCell) {
-		if (hssfCell == null)
+		if (hssfCell == null) {
 			return "";
-		if (hssfCell.getCellType() == hssfCell.CELL_TYPE_BOOLEAN) {
-			return String.valueOf(hssfCell.getBooleanCellValue());
-		} else if (hssfCell.getCellType() == hssfCell.CELL_TYPE_NUMERIC) {
-			return String.valueOf(hssfCell.getNumericCellValue());
-		} else {
-			hssfCell.setCellType(hssfCell.CELL_TYPE_STRING);
-			return String.valueOf(hssfCell.getStringCellValue());
+		}
+		CellType cellType = hssfCell.getCellType();
+		// Return empty unless the cell is boolean, numeric or string.
+		if (!CellType.BOOLEAN.equals(cellType) && !CellType.NUMERIC.equals(cellType) && !CellType.STRING.equals(cellType)) {
+			return "";
+		}
+		try {
+			if (CellType.BOOLEAN.equals(cellType)) {
+				return String.valueOf(hssfCell.getBooleanCellValue());
+			} else if (CellType.NUMERIC.equals(cellType)) {
+				return String.valueOf(hssfCell.getNumericCellValue());
+			} else {
+				return String.valueOf(hssfCell.getStringCellValue());
+			}
+		} catch (Exception e) {
+			throw e;
 		}
 	}
-	
+
 	/**
-	 * 	鑾峰彇xlxs鍗曞厓鏍煎��
+	 * Gets the value of an xlsx cell.
+	 *
 	 * @param xssfRow
 	 * @return
 	 */
 	private static String getValue(XSSFCell xssfRow) {
 		if (xssfRow == null)
 			return "";
-		if (xssfRow.getCellType() == xssfRow.CELL_TYPE_BOOLEAN) {
+		CellType cellType = xssfRow.getCellType();
+		// Return empty unless the cell is boolean, numeric or string.
+		if (!CellType.BOOLEAN.equals(cellType) && !CellType.NUMERIC.equals(cellType) && !CellType.STRING.equals(cellType)) {
+			return "";
+		}
+		if (CellType.BOOLEAN.equals(cellType)) {
 			return String.valueOf(xssfRow.getBooleanCellValue());
-		} else if (xssfRow.getCellType() == xssfRow.CELL_TYPE_NUMERIC) {
+		} else if (CellType.NUMERIC.equals(cellType)) {
 			return String.valueOf(xssfRow.getNumericCellValue());
 		} else {
 			return String.valueOf(xssfRow.getStringCellValue());
 		}
 	}
-	
-	public static void main(String[] args) {
-		File f1=new File("D://ceshi/a.txt");
-		File f2=new File("D://ceshi/b.doc");
-		File f3=new File("D://ceshi/c.docx");
-		File f4=new File("D://ceshi/d.xls");
-		File f5=new File("D://ceshi/e.xlsx");
+
+	public static void main(String[] args) throws Exception {
+		File f1 = new File("D://ceshi/a.txt");
+		File f2 = new File("D://ceshi/b.doc");
+		File f3 = new File("D://ceshi/c.docx");
+		File f4 = new File("D://ceshi/d.xls");
+		File f5 = new File("D://ceshi/e.xlsx");
 		System.out.println(txtToString(f1));
 		System.out.println("+++++++++++++++++++++++++++++++++++++++++++");
 		System.out.println(docToString(f2));

--
Gitblit v1.9.2