I have been tasked with reading large CSV files (300k+ records) and applying regexp patterns to each record. I have always been a PHP developer and never really tried any other languages, but I decided I should take the dive and attempt to do this with Java, which I assumed would be much faster.
In fact, just reading the CSV file line by line was 3x faster in Java. However, when I applied the regexp requirements, the Java implementation proved to take 10-20% longer than the PHP script.
It is quite possible that I did something wrong in Java, because I only learned it as I went today. Below are the two scripts; any advice would be greatly appreciated. I would really like not to give up on Java for this particular project.
PHP CODE
<?php
/**
 * Benchmark: run every pattern against index 6 of each CSV record in
 * largeCSV.txt and print the elapsed wall-clock time in whole seconds.
 */
$bgtime = time();

// Patterns applied to every record.
$patterns = array(
    "/SOME REGEXP/",
    "/SOME REGEXP/",
    "/SOME REGEXP/",
    "/SOME REGEXP/"
);

$fh = fopen('largeCSV.txt', 'r');
if ($fh === false)
{
    // fopen() returns false on failure; stop here instead of passing a
    // non-stream value on to fgetcsv().
    fwrite(STDERR, "Unable to open largeCSV.txt\n");
    exit(1);
}

// Compare against false explicitly so that only EOF or a read error ends the loop.
while (($currentLineString = fgetcsv($fh, 10000, ",")) !== false)
{
    // A blank line is returned as array(null) and a short row has no index 6;
    // skip both rather than reading an undefined offset.
    if (!isset($currentLineString[6]))
    {
        continue;
    }

    foreach ($patterns as $pattern)
    {
        // $matches is overwritten on every call; only the matching work is being timed.
        preg_match_all($pattern, $currentLineString[6], $matches);
    }
}
fclose($fh);
print "Execution Time: ".(time()-$bgtime);
?>
JAVA CODE
import au.com.bytecode.opencsv.CSVReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.ArrayList;
/**
 * Benchmark that applies a fixed set of regular expressions to one column of
 * every record in a CSV file and prints the elapsed time in whole seconds.
 */
public class testParser
{
    /** Zero-based index of the CSV column the patterns are run against. */
    private static final int DESCRIPTION_COLUMN = 6;

    /**
     * Reads {@code largeCSV.txt}, discards its first row, and runs every
     * pattern over the {@link #DESCRIPTION_COLUMN} field of each remaining row.
     * Rows that are too short to contain that column are skipped. An I/O
     * failure is reported on standard error; the elapsed time is printed
     * either way.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args)
    {
        long start = System.currentTimeMillis();
        String[] rawPatterns = {
                    "SOME REGEXP",
                    "SOME REGEXP",
                    "SOME REGEXP",
                    "SOME REGEXP"
        };

        // Compile each pattern once and keep a single reusable Matcher for it.
        // reset(CharSequence) points the matcher at a new input, so the record
        // loop below does not allocate a Matcher per pattern per row.
        Matcher[] matchers = new Matcher[rawPatterns.length];
        for (int i = 0; i < rawPatterns.length; i++)
        {
            matchers[i] = Pattern.compile(rawPatterns[i]).matcher("");
        }

        String fileName = "largeCSV.txt";
        CSVReader reader = null;
        try
        {
            reader = new CSVReader(new FileReader(fileName));

            // The first row is read and discarded (treated as a header).
            reader.readNext();

            String[] nextLine;
            while ((nextLine = reader.readNext()) != null)
            {
                // Guard against rows with fewer columns than expected; indexing
                // them directly would throw ArrayIndexOutOfBoundsException.
                if (nextLine.length <= DESCRIPTION_COLUMN)
                {
                    continue;
                }

                String description = nextLine[DESCRIPTION_COLUMN];
                for (Matcher m : matchers)
                {
                    m.reset(description);
                    while (m.find())
                    {
                        // Matches are intentionally discarded; only the matching
                        // work is being timed. m.group(0) holds the current match.
                    }
                }
            }
        }
        catch (IOException ioe)
        {
            System.err.println("Failed to read " + fileName + ": " + ioe.getMessage());
        }
        finally
        {
            // Always release the underlying file handle, even after a failure.
            if (reader != null)
            {
                try
                {
                    reader.close();
                }
                catch (IOException ignored)
                {
                    // Nothing useful can be done if closing fails at this point.
                }
            }
        }

        long end = System.currentTimeMillis();
        System.out.println("Execution time was "+((end-start)/1000)+" seconds.");
    }
}
 
     
     
    

 
     
     
     
    