I have a large Word document (over 10,000 lines) containing a table of information that must be converted to Excel using Java. I am using Apache POI to extract the table and save it to an Excel file. The following code works on a subset of the rows on an iMac, but running it on the full document fails with a Java heap space error (`OutOfMemoryError`):
/**
 * Converts the tab-separated text extracted from {@code table.docx} into the
 * spreadsheet {@code new.xls}.
 *
 * <p>Each line of the extracted text becomes one row of the sheet
 * {@code sheet1}; each tab-separated field becomes one string cell. Every
 * closeable resource is released as soon as it is no longer needed, so the
 * source document is not kept open while the workbook is being built.
 */
public class WordExtractor {

  /** Path of the Word document to read. */
  private static final String INPUT_PATH = "table.docx";

  /** Path of the Excel workbook to write. */
  private static final String OUTPUT_PATH = "new.xls";

  /** Name of the single sheet created in the output workbook. */
  private static final String SHEET_NAME = "sheet1";

  /** Separator between fields on one line of the extracted text. */
  private static final String FIELD_SEPARATOR = "\t";

  /**
   * Runs the conversion. Any failure is reported by printing its stack trace;
   * nothing is rethrown.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    try (HSSFWorkbook workbook = new HSSFWorkbook()) {
      HSSFSheet sheet = workbook.createSheet(SHEET_NAME);
      // The extracted text is only passed through as an argument, so nothing
      // in this method keeps a reference to it once the sheet is filled.
      fillSheet(sheet, extractText(new File(INPUT_PATH)));

      try (FileOutputStream out = new FileOutputStream(new File(OUTPUT_PATH))) {
        workbook.write(out);
      }
    } catch (Exception ex) {
      // Top-level boundary of a command-line tool: report and exit normally.
      ex.printStackTrace();
    }
  }

  /**
   * Reads the complete text of the given document and closes the extractor
   * before returning.
   *
   * @param inputFile document to read
   * @return the text reported by the extractor
   * @throws Exception if the extractor cannot be created, read or closed; the
   *     broad type is used because the checked exceptions declared by
   *     {@code ExtractorFactory.createExtractor} differ between POI versions
   */
  private static String extractText(File inputFile) throws Exception {
    // NOTE(review): closing the extractor is expected to release the parsed
    // document it wraps - confirm against the POI version in use.
    try (POITextExtractor extractor = ExtractorFactory.createExtractor(inputFile)) {
      return extractor.getText();
    }
  }

  /**
   * Writes one row per line of {@code text} and one string cell per non-empty
   * tab-separated field, echoing each written value to standard output.
   *
   * @param sheet sheet that receives the rows, starting at row index 0
   * @param text extracted text, with fields separated by tabs
   * @throws IOException if reading from the in-memory text fails
   */
  private static void fillSheet(HSSFSheet sheet, String text) throws IOException {
    try (BufferedReader reader = new BufferedReader(new StringReader(text))) {
      int rowCount = 0;
      String line;
      while ((line = reader.readLine()) != null) {
        Row row = sheet.createRow(rowCount++);

        // split with a negative limit keeps empty fields, so the array index
        // is the true column index. StringTokenizer collapses consecutive
        // tabs, which shifted later values left whenever a field was empty.
        String[] fields = line.split(FIELD_SEPARATOR, -1);
        for (int column = 0; column < fields.length; column++) {
          String token = fields[column];
          if (token.isEmpty()) {
            // Leave the cell absent; the column position is still consumed.
            continue;
          }
          Cell cell = row.createCell(column);
          System.out.println(" = " + token);
          cell.setCellValue(token);
        }
      }
    }
  }
}
 
    