读取 Read_PDF_WORD_EXCEL_PowerPoint_Visio

最新推荐文章于 2024-08-14 13:41:34 发布

原创最新推荐文章于 2024-08-14 13:41:34 发布 · 1.3k 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#powerpoint #visio #excel #file #exception #string

Java 专栏收录该内容

8 篇文章

订阅专栏

本文介绍了一个Java程序，能够批量将PDF、Word、Excel、PowerPoint和Visio等文件格式转换为TXT文本格式，并提供了文件复制功能。该程序利用了Apache POI和PDFBox等开源库来实现文档内容的提取。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

如题，Read_PDF_WORD_EXCEL_PowerPoint_Visio 读取 PDF、Word、Excel、PowerPoint、Visio 这些格式的文件，并将其转化为 txt，呵呵，放入 MP4 看……刚学发帖，不知怎样排版。在运行中遇到 CPU 占用居高不下的问题，不知怎样解决……

import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.channels.FileChannel; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; import org.apache.poi.POIOLE2TextExtractor; import org.apache.poi.extractor.ExtractorFactory; import org.apache.poi.hdgf.extractor.VisioTextExtractor; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; public class Read_PDF_WORD_EXCEL_PowerPoint_Visio { private String ReStr ="(//w+|[/u4E00-/u9FFF].*|//w+[/u4E00-/u9FFF].*)//.(xls|doc|ppt|vsd){1}"; private static String path1="C:/Documents and Settings/Administrator/桌面/Tapestry配置"; private static String path2="D:/Backup/我的文档/txt"; private static String path3="D:/Backup/我的文档/Downloads"; private static String[] pathStr={path1,path2,path3}; private Read_PDF read_PDF =null; private Read_WORD_EXCEL_PowerPoint_Visio read_WEPV=null; private CopyTxt copyTxt=null; public static void main(String[] args) { new Read_PDF_WORD_EXCEL_PowerPoint_Visio().file(pathStr); } private void file(String[] pathStr) { for(String path:pathStr){ File file = new File(path); rFile(file); } } private void rFile(File file) { File[] lst = file.listFiles(); for (File f : lst) { if (f.isDirectory()) { rFile(f); } else { String fileName = f.getName(); if (fileName.endsWith(".pdf")) { try { read_PDF= new Read_PDF(f); read_PDF.start(); read_PDF.join(); Thread.sleep(1000); } catch (Exception e) { e.printStackTrace(); } } else if (fileName.matches(ReStr)) { try { read_WEPV = new Read_WORD_EXCEL_PowerPoint_Visio(f); read_WEPV.start(); read_WEPV.join(); Thread.sleep(1000); } catch (Exception e) { e.printStackTrace(); } }else if(fileName.endsWith(".txt")){ try { copyTxt=new CopyTxt(f); 
copyTxt.start(); copyTxt.join(); Thread.sleep(1000); } catch (Exception e) { e.printStackTrace(); } } } } } } class Read_PDF extends Extend_Thread { File file = null; PDDocument pdf = null; Writer out=null; public Read_PDF(File file) throws Exception { this.file = file; pdf = PDDocument.load(file); out = new OutputStreamWriter(new FileOutputStream(writeFileName(file)),"gb2312"); } @Override public void run() { synchronized (file) { try { PDFTextStripper s = new PDFTextStripper(); s.setStartPage(1); s.setEndPage(Integer.MAX_VALUE); if (!out.equals("")) { s.writeText(pdf, out); } out.flush(); // file.delete(); } catch (Exception e) { e.printStackTrace(); } finally { try { if (pdf != null) { pdf.close(); pdf = null; } if (out != null) { out.close(); out = null; } } catch (IOException e) { e.printStackTrace(); } } } } } class CopyTxt extends Extend_Thread{ File file = null; FileChannel srcChannel =null; FileChannel dstChannel =null; public CopyTxt(File file) throws Exception { this.file = file; } public void run(){ synchronized (file) { try { srcChannel = new FileInputStream(file.getAbsoluteFile()).getChannel(); // Create channel on the destination dstChannel = new FileOutputStream(writeFileName(file)).getChannel(); // Copy file contents from source to destination dstChannel.transferFrom(srcChannel, 0, srcChannel.size()); // Close the channels srcChannel.close(); dstChannel.close(); file.delete(); } catch (IOException e) { e.printStackTrace(); }finally{ try { if(srcChannel!=null){ srcChannel.close(); } if(dstChannel!=null){ dstChannel.close(); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } } class Read_WORD_EXCEL_PowerPoint_Visio extends Extend_Thread { File file = null; Writer out = null; public Read_WORD_EXCEL_PowerPoint_Visio(File file) throws Exception { this.file = file; out = new OutputStreamWriter(new FileOutputStream(writeFileName(file)),"gb2312"); } /** * from PIO 
http://poi.apache.org/text-extraction.html THANKS Read Word * Excel PowerPoint Visio */ @Override public void run() { synchronized (file) { try { FileInputStream fis = new FileInputStream(file); POIFSFileSystem fileSystem = new POIFSFileSystem(fis); // Firstly, get an extractor for the Workbook POIOLE2TextExtractor oleTextExtractor = ExtractorFactory.createExtractor(fileSystem); if (oleTextExtractor instanceof ExcelExtractor) { ExcelExtractor excelExtractor = (ExcelExtractor) oleTextExtractor; // Should cell comments(注解) be included? Default is false excelExtractor.setIncludeCellComments(true); // Should sheet names be included? Default is true excelExtractor.setIncludeSheetNames(true); out.write(excelExtractor.getText()); out.flush(); } // A Word Document /** * Display the document's header and footer text: * wordExtractor.getFooterText(),wordExtractor.getHeaderText() */ else if (oleTextExtractor instanceof WordExtractor) { WordExtractor wordExtractor = (WordExtractor) oleTextExtractor; String[] paragraphText = wordExtractor.getParagraphText(); for (String paragraph : paragraphText) { out.write(paragraph); out.flush(); } } // PowerPoint Presentation. /* * Fetches all the notes text from the slideshow, but not the * slide text : powerPointExtractor.getNotes() */ else if (oleTextExtractor instanceof PowerPointExtractor) { PowerPointExtractor powerPointExtractor = (PowerPointExtractor) oleTextExtractor; // Should a call to getText() return comments(注解) text? Default is no powerPointExtractor.setCommentsByDefault(true); // Should a call to getText() return notes(笔记) text? Default is no powerPointExtractor.setNotesByDefault(true); // Should a call to getText() return slide(滑动) text? 
Default is yes powerPointExtractor.setSlidesByDefault(true); out.write(powerPointExtractor.getText()); out.flush(); } // Visio Drawing else if (oleTextExtractor instanceof VisioTextExtractor) { VisioTextExtractor visioTextExtractor = (VisioTextExtractor) oleTextExtractor; // Locates all the text entries in the file, and returns their contents(目录) : visioTextExtractor.getAllText() // Returns the textual(正文) contents of the file : visioTextExtractor.getText() out.write(visioTextExtractor.getText()); out.flush(); } file.delete(); } catch (Exception e) { e.printStackTrace(); } finally { if (out != null) { try { out.close(); } catch (IOException e) { e.printStackTrace(); } out = null; } } } } }