Apache poi's default opening a workbook using WorkbookFactory.create or new XSSFWorkbook will always parsing the whole workbook inclusive all sheets. If the workbook contains much data this leads to high memory usage. Opening the workbook using a File instead of a InputStream decreases the memory usage. But this leads to other problems since the used file then cannot be overwritten, at least not when *.xlsx files.
There is XSSF and SAX (Event API) which get at the underlying XML data, and process using SAX.
But if we are already at this level where we get at the underlying XML data, and process it, then we could go one more step back too.
A *.xlsx file is a ZIP archive containing the data in XML files within a directory structure. So we can unzip the *.xlsx file and get the data from the XML files then.
There is /xl/sharedStrings.xml having all the string cell values in it. And there is /xl/workbook.xml describing the workbook structure. And there are /xl/worksheets/sheet1.xml, /xl/worksheets/sheet2.xml, ... which are storing the sheets' data. And there is /xl/styles.xml having the style settings for all cells in the sheets.
So all we need is working with ZIP file system using Java. This is supported using java.nio.file.FileSystems.
And we need a possibility for parsing XML. There Package javax.xml.stream is my favorite.
The following shows a working draft. It parses the /xl/sharedStrings.xml. Also it parses the /xl/styles.xml. But it gets only the number formats and the cell number format settings. The number format settings are essential for detecting date / time values. It then parses the /xl/worksheets/sheet1.xml which contains the data of the first sheet. For detecting whether a number format is a date format, and so the formatted cell contains a date / time value, one single apache poi class org.apache.poi.ss.usermodel.DateUtil is used. This is done to simplify the code. Of course even this class we could have coded ourself.
import java.nio.file.Paths;
import java.nio.file.Path;
import java.nio.file.Files;
import java.nio.file.FileSystems;
import java.nio.file.FileSystem;
import javax.xml.stream.*;
import javax.xml.stream.events.*;
import javax.xml.namespace.QName;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.util.Date;
import org.apache.poi.ss.usermodel.DateUtil;
public class UnZipAndReadXLSXFileSystem {
public static void main (String args[]) throws Exception {
XMLEventReader reader = null;
XMLEvent event = null;
Attribute attribute = null;
StartElement startElement = null;
EndElement endElement = null;
String characters = null;
StringBuilder stringValue = new StringBuilder(); //for collecting the characters to complete values
List<String> sharedStrings = new ArrayList<String>(); //list of shared strings
Map<String, String> numberFormats = new HashMap<String, String>(); //map of number formats
List<String> cellNumberFormats = new ArrayList<String>(); //list of cell number formats
Path source = Paths.get("ExcelExample.xlsx"); //path to the Excel file
FileSystem fs = FileSystems.newFileSystem(source, null); //get filesystem of Excel file
//get shared strings ==============================================================================
Path sharedStringsTable = fs.getPath("/xl/sharedStrings.xml");
reader = XMLInputFactory.newInstance().createXMLEventReader(Files.newInputStream(sharedStringsTable));
boolean siFound = false;
while (reader.hasNext()) {
event = (XMLEvent)reader.next();
if (event.isStartElement()){
startElement = (StartElement)event;
if (startElement.getName().getLocalPart().equalsIgnoreCase("si")) {
//start element of shared string item
siFound = true;
stringValue = new StringBuilder();
}
} else if (event.isCharacters() && siFound) {
//chars of the shared string item
characters = event.asCharacters().getData();
stringValue.append(characters);
} else if (event.isEndElement() ) {
endElement = (EndElement)event;
if (endElement.getName().getLocalPart().equalsIgnoreCase("si")) {
//end element of shared string item
siFound = false;
sharedStrings.add(stringValue.toString());
}
}
}
reader.close();
System.out.println(sharedStrings);
//shared strings ==================================================================================
//get styles, number formats are essential for detecting date / time values =======================
Path styles = fs.getPath("/xl/styles.xml");
reader = XMLInputFactory.newInstance().createXMLEventReader(Files.newInputStream(styles));
boolean cellXfsFound = false;
while (reader.hasNext()) {
event = (XMLEvent)reader.next();
if (event.isStartElement()){
startElement = (StartElement)event;
if (startElement.getName().getLocalPart().equalsIgnoreCase("numFmt")) {
//start element of number format
attribute = startElement.getAttributeByName(new QName("numFmtId"));
String numFmtId = attribute.getValue();
attribute = startElement.getAttributeByName(new QName("formatCode"));
numberFormats.put(numFmtId, ((attribute != null)?attribute.getValue():"null"));
} else if (startElement.getName().getLocalPart().equalsIgnoreCase("cellXfs")) {
//start element of cell format setting
cellXfsFound = true;
} else if (startElement.getName().getLocalPart().equalsIgnoreCase("xf") && cellXfsFound ) {
//start element of format setting in cell format setting
attribute = startElement.getAttributeByName(new QName("numFmtId"));
cellNumberFormats.add(((attribute != null)?attribute.getValue():"null"));
}
} else if (event.isEndElement() ) {
endElement = (EndElement)event;
if (endElement.getName().getLocalPart().equalsIgnoreCase("cellXfs")) {
//end element of cell format setting
cellXfsFound = false;
}
}
}
reader.close();
System.out.println(numberFormats);
System.out.println(cellNumberFormats);
//styles ==========================================================================================
//get sheet data of first sheet ===================================================================
Path sheet1 = fs.getPath("/xl/worksheets/sheet1.xml");
reader = XMLInputFactory.newInstance().createXMLEventReader(Files.newInputStream(sheet1));
boolean rowFound = false;
boolean cellFound = false;
boolean cellValueFound = false;
boolean inlineStringFound = false;
String cellStyle = null;
String cellType = null;
while (reader.hasNext()) {
event = (XMLEvent)reader.next();
if (event.isStartElement()){
startElement = (StartElement)event;
if (startElement.getName().getLocalPart().equalsIgnoreCase("row")) {
//start element of row
rowFound = true;
System.out.print("<Row");
attribute = startElement.getAttributeByName(new QName("r"));
System.out.print(" r=" + ((attribute != null)?attribute.getValue():"null"));
System.out.println(">");
} else if (startElement.getName().getLocalPart().equalsIgnoreCase("c") && rowFound) {
//start element of cell in row
cellFound = true;
System.out.print("<Cell");
attribute = startElement.getAttributeByName(new QName("r"));
System.out.print(" r=" + ((attribute != null)?attribute.getValue():"null"));
attribute = startElement.getAttributeByName(new QName("t"));
System.out.print(" t=" + ((attribute != null)?attribute.getValue():"null"));
cellType = ((attribute != null)?attribute.getValue():null);
attribute = startElement.getAttributeByName(new QName("s"));
System.out.print(" s=" + ((attribute != null)?attribute.getValue():"null"));
cellStyle = ((attribute != null)?attribute.getValue():null);
System.out.print(">");
} else if (startElement.getName().getLocalPart().equalsIgnoreCase("v") && cellFound) {
//start element of value in cell
cellValueFound = true;
System.out.print("<V>");
stringValue = new StringBuilder();
} else if (startElement.getName().getLocalPart().equalsIgnoreCase("is") && cellFound) {
//start element of inline string in cell
inlineStringFound = true;
System.out.print("<Is>");
stringValue = new StringBuilder();
}
} else if (event.isCharacters() && cellFound && (cellValueFound || inlineStringFound)) {
//chars of the cell value or the inline string
characters = event.asCharacters().getData();
stringValue.append(characters);
} else if (event.isEndElement()) {
endElement = (EndElement)event;
if (endElement.getName().getLocalPart().equalsIgnoreCase("row")) {
//end element of row
rowFound = false;
System.out.println("</Row>");
} else if (endElement.getName().getLocalPart().equalsIgnoreCase("c")) {
//end element of cell
cellFound = false;
System.out.println("</Cell>");
} else if (endElement.getName().getLocalPart().equalsIgnoreCase("v")) {
//end element of value
cellValueFound = false;
String cellValue = stringValue.toString();
if ("s".equals(cellType)) {
cellValue = sharedStrings.get(Integer.valueOf(cellValue));
}
if (cellStyle != null) {
int s = Integer.valueOf(cellStyle);
String formatIndex = cellNumberFormats.get(s);
String formatString = numberFormats.get(formatIndex);
if (DateUtil.isADateFormat(Integer.valueOf(formatIndex), formatString)) {
double dDate = Double.parseDouble(cellValue);
Date date = DateUtil.getJavaDate(dDate);
cellValue = date.toString();
}
}
System.out.print(cellValue);
System.out.print("</V>");
} else if (endElement.getName().getLocalPart().equalsIgnoreCase("is")) {
//end element of inline string
inlineStringFound = false;
String cellValue = stringValue.toString();
System.out.print(cellValue);
System.out.print("</Is>");
}
}
}
reader.close();
//sheet data ======================================================================================
fs.close();
}
}