Java读Excel文件

2017-04-20     浏览次数:

如果用XSSFWorkbook 来读取excel,遇到文件太大时候,会导致oom,  OutOfMemoryError: Java heap space

所以apache官网上有推荐 

If memory footprint is an issue, then for XSSF, you can get at the underlying XML data, and process it yourself.

http://poi.apache.org/spreadsheet/how-to.html#xssf_sax_api

通过xml的方式读取excel,一行一行的处理,也不需要将整个文件载入,导致内存不足情况

带注释的官网例子:

package com.datacenter.hbase.adapter;
 
import java.io.InputStream;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFReader.SheetIterator;
import org.apache.poi.xssf.model.SharedStringsTable;
import org.apache.poi.xssf.usermodel.XSSFRichTextString;
import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTRst;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

public class ExcelFileHandler {
	/**
	 * 解析一个sheet 
	 * @param filename
	 * @throws Exception
	 */
	public void processOneSheet(String filename,String sheetName) throws Exception {
		OPCPackage pkg = OPCPackage.open(filename);
		XSSFReader r = new XSSFReader( pkg );
		SharedStringsTable sst = r.getSharedStringsTable(); //打开文件获取文件句柄

		XMLReader parser = fetchSheetParser(sst);

		// To look up the Sheet Name / Sheet Order / rID,
		//  you need to process the core Workbook stream.
		// Normally it's of the form rId# or rSheet#
		InputStream sheet2 = r.getSheet( sheetName );
		InputSource sheetSource = new InputSource(sheet2);
		parser.parse(sheetSource);
		sheet2.close();
	}
	/**
	 * 解析多个sheet
	 * @param filename
	 * @throws Exception
	 */
	public void processAllSheets(String filename) throws Exception {
		OPCPackage pkg = OPCPackage.open(filename);
		XSSFReader r = new XSSFReader( pkg );
		SharedStringsTable sst = r.getSharedStringsTable();
		
		System.out.println( "sheet.size="+sst.getCount() );
		for( CTRst rst : sst.getItems() ){
			//System.out.println( rst.get);
		}
		
		XMLReader parser = fetchSheetParser(sst);

		 XSSFReader.SheetIterator sheets = (SheetIterator) r.getSheetsData();
		while(sheets.hasNext()) {
			System.out.println("Processing new sheet:\n");
			InputStream sheet = sheets.next();
			System.out.println( "sheetname:"+sheets.getSheetName() );;
			InputSource sheetSource = new InputSource(sheet);
			parser.parse(sheetSource);
			sheet.close();
			System.out.println("");
		}
	}

	public XMLReader fetchSheetParser(SharedStringsTable sst) throws SAXException {
		XMLReader parser =
			XMLReaderFactory.createXMLReader(
					"org.apache.xerces.parsers.SAXParser"
			);
		//自定义处理
		ContentHandler handler = new SheetHandler(sst);
		parser.setContentHandler(handler);
		return parser;
	}

	/** 
	 * See org.xml.sax.helpers.DefaultHandler javadocs 
	 * 将excel作为xml来处理 ,类似如下格式
	 * <row>
	 * 	 <cell/>
	 * 	 <cell/>
	 *	 <cell/>
	 *	 <cell/>
	 * </row>
	 * 
	 */
	private static class SheetHandler extends DefaultHandler {
		private SharedStringsTable sst;
		private String lastContents;
		private boolean nextIsString;
		private AtomicInteger counter = new AtomicInteger( 0 ) ;
		
		private SheetHandler(SharedStringsTable sst) {
			this.sst = sst;
		}
		
		public void exist(){
			if( counter .get() == 10 ){
				System.exit( 0 );
			}
		}
		/**
		 * 处理<cell>
		 */
		public void startElement(String uri, String localName, String name,
				Attributes attributes) throws SAXException {
			// c => cell
			if(name.equals("c")) {
				String rowNum = attributes.getValue("r") ;
				// Print the cell reference
				if( rowNum.startsWith("A") ){
					exist() ;
					counter.incrementAndGet() ;
					System.out.print( rowNum + " - "); //坐标
				}
				// Figure out if the value is an index in the SST
				// cellType == null ,if there is empty
				String cellType = attributes.getValue("t"); 
				if(cellType != null && cellType.equals("s")) {
					//下面是否是字符串
					nextIsString = true;
				} else {
					nextIsString = false;
				}
			}
			// Clear contents cache
			lastContents = "";
		}
		/**
		 * 处理结尾的 </cell>
		 */
		public void endElement(String uri, String localName, String name)
				throws SAXException {
			// Process the last contents as required.
			// Do now, as characters() may be called more than once
			if(nextIsString) {
				int idx = Integer.parseInt(lastContents);
				//获取单元格数据
				lastContents = new XSSFRichTextString(sst.getEntryAt(idx)).toString();
				nextIsString = false;
			}
			
			// v => contents of a cell
			// Output after we've seen the string contents
			if(name.equals("v")) {
				//处理每个单元格数据
				if(  lastContents == null  || "".equals( lastContents )  ){
					System.out.print("  #  null");
				}
				System.out.print("  #  "+lastContents);
			}
			if( "row".equals( name ) ){ //一行结束
				System.out.println(); //change a line
			}
		}

		public void characters(char[] ch, int start, int length)
				throws SAXException {
			lastContents += new String(ch, start, length);
		}
	}
	
	public static void main(String[] args) throws Exception {
		String file = "F:\\excel\\IPAddressInfo.xlsx" ;
		ExcelFileHandler example = new ExcelFileHandler();
		//example.processOneSheet( file , "rId1");
		example.processAllSheets( file );
	}
}

上面的代码是没有处理空值的情况的,因为xml读取的时候会跳过该cell,如果要处理空值。

参考:http://www.itbiancheng.com/java/2507.html
*转载请注明出处,更多资料尽在 吾爱编程
返回吾爱编程首页