Any advice velcome.
using this:
http://htmlparser.sourceforge.net/
the problem here is that when I m taking data from 36 pages in loop
for z ... to 36 the things get teribly (unexceptable even at first page) slow with time, although there is only a small bit of data on each page
calendar does not take time, it was slow before I putted it in.
reading and parsing html is also not reason for this.
so the log from below code looks like this:
diff represents time taken to read data from nl (nodelist) and construct something like this:
"species","data1"","data2"","data3"","data4"
Code:
DEBUG - AgroZoo: counting: 1//i.e page 1 from loop
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 0
....
DEBUG - AgroZoo: diff: 10
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 10
DEBUG - AgroZoo: diff: 10
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 10
DEBUG - AgroZoo: diff: 0
DEBUG - AgroZoo: diff: 10
DEBUG - AgroZoo: diff: 0
....
DEBUG - AgroZoo: counting: 10
DEBUG - AgroZoo: diff: 13119
DEBUG - AgroZoo: diff: 240
DEBUG - AgroZoo: diff: 100
DEBUG - AgroZoo: diff: 120
DEBUG - AgroZoo: diff: 251
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 290
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 251
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 250
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 51
DEBUG - AgroZoo: diff: 240
DEBUG - AgroZoo: diff: 60
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 250
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 61
DEBUG - AgroZoo: diff: 250
DEBUG - AgroZoo: diff: 60
DEBUG - AgroZoo: diff: 240
DEBUG - AgroZoo: diff: 70
DEBUG - AgroZoo: diff: 61
DEBUG - AgroZoo: diff: 270
DEBUG - AgroZoo: diff: 50
DEBUG - AgroZoo: diff: 60
DEBUG - AgroZoo: diff: 291
code:
Code:
public static void go(String appPath)
{
File tmpFile = null;
URL url = null;
Parser parser = null;
String tmpStr = "";
int counter = 0;
TagNameFilter tnf = null;
NodeList nl = null;
//NodeList nl1 = null;
String species = "";
NodeClassFilter ncf = null;
HasAttributeFilter haf = null;
long startTime = 0;
long endTime = 0;
try
{
Logger logger = mc.globals.StartUp.getLogger("dataLoaderHTML");
// 36 pages to read
for(int z = 1; z <= 36; z++)
{
//.. etc - code to get desired data from html tnf = filter this part is not slow
nl = parser.parse(tnf);
//here , inside nl, we have TD-s from table with 4 columns ....,
//nl.size() up to 10000 or something
// and this loop gets slower and slower even at z = 1 !!!
for(int i = 4; i < nl.size(); i++)
{
if (nl.elementAt(i).getText().contains("colspan=4"))
{
species = nl.elementAt(i).toPlainTextString().replaceAll(System.getProperty("line.separator"), "").trim();
}
else
{
counter = counter +1;
if (counter == 1)
{
Calendar cal = Calendar.getInstance();
startTime = cal.getTimeInMillis();
tmpStr = tmpStr + "\"" + species + "\",";
}
tmpStr = tmpStr + "\"" + nl.elementAt(i).toPlainTextString().replaceAll(System.getProperty("line.separator"), "").trim() + "\"";
if ((counter == 4) )
{
counter = 0;
tmpStr = tmpStr + System.getProperty("line.separator");
Calendar cal = Calendar.getInstance();
endTime = cal.getTimeInMillis();
logger.debug("AgroZoo:" + " diff: " + (endTime-startTime));
}
else
{
tmpStr = tmpStr + ",";
}
}
}
}
// does not get to here because takes hours
// --> save tmpStr to .csv
}
catch(ParserException ex)
{
Logger logger = mc.globals.StartUp.getLogger("dataLoaderHTML");
logger.debug("AgroZoo:" + ex.getMessage());
//return false;
}
catch(Exception ex)
{
Logger logger = mc.globals.StartUp.getLogger("dataLoaderHTML");
logger.debug("AgroZoo:" + ex.getMessage());
}
}