package com.telnext.search;

/*
 * JARs REQUIRED for indexing PDF files (all present in the pdfbox project):
 *   PDFBox-0.7.3.jar
 *   FontBox-0.1.0-dev.jar (org.fontbox.)
 *   poi-3.0-alpha3-20070130.jar
 *   poi-scratchpad-3.0-alpha3-20070130.jar
 *   bcprov-jdk14-132.jar (org.bouncycastle.)
 *   bcmail-jdk14-132.jar (org.bouncycastle.)
 * OTHER items not present in the one above
 */
//import it.rub3.rub.RubUtil;
import com.telnext.bean.SecurityBean;
import com.telnext.utility.UtilityXML;
import java.io.*;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.hwpf.model.FileInformationBlock;
//import org.jdom.input.SAXBuilder;
import org.dom4j.Element;
import org.dom4j.Node;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;

/**
 * Builds two Lucene indexes ("Articoli" and "Schede") from data obtained by
 * running function strings through a {@link SecurityBean}, and offers helpers
 * to extract plain text from PDF and Word files.
 *
 * <p>The accessors {@code getXmlRighe()}, {@code getUIPopup()}, {@code getDoc()},
 * {@code getPath()} and {@code setFun()} are not declared in this class; they are
 * inherited from {@code BaseBean} (or one of its ancestors).
 */
public class SpiderLucene extends com.telnext.bean.BaseBean {

    /** Stop words handed to the {@link StandardAnalyzer} of both indexes. */
    private static final String[] ITALIAN_STOP_WORDS = {
        "di", "a", "da", "in", "con", "su", "per", "tra", "fra", "il", "lo", "la", "i", "gli", "le",
        "a", "e", "i", "o", "u", "che", "con", "chi", "cosa", "quando", "dove", "perché", "io", "tu",
        "lui", "lei", "loro", "noi", "voi", "dal", "del", "dalle", "dagli", "agli", "al", "allo", "alle"
    };

    /** Currently empty and not referenced anywhere in this class. */
    private static final String[] SOURCE_SCHEDA_STOP_WORDS = {};

    /** Path of the log file of the indexing run in progress. */
    private String log;
    private File f;
    private DataOutputStream dos;

    /** Writer of the index currently being built. */
    public IndexWriter index;

    /** When {@code false} (default) the index is created from scratch. */
    public boolean incremental = false;

    public Vector<String> articoli = new Vector<String>();
    public Vector<String> schede = new Vector<String>();

    /** Vector of objects for spidering. */
    public Vector<String> objects = new Vector<String>();

    private String luceneIndexPathArt = super.getPath() + "/htdocs/Lucene/Articoli";
    private String luceneIndexPathSource = super.getPath() + "/htdocs/Lucene/Schede";
    private SecurityBean sb;

    /**
     * Configures the {@link SecurityBean} and runs both indexing passes.
     *
     * @param argv four values passed, in order, to {@code setIndirizzo},
     *             {@code setUsername}, {@code setPassword} and {@code setEnvironment}
     * @throws Exception if any indexing step fails
     */
    public void init(String argv[]) throws Exception {
        sb = new SecurityBean();
        sb.setIndirizzo(argv[0]);
        sb.setUsername(argv[1]);
        sb.setPassword(argv[2]);
        sb.setEnvironment(argv[3]);
        String ids = String.valueOf((new java.util.Random()).nextInt() + 1);
        sb.setIdsessione(ids);

        // --- ARTICLES INDEXING ---
        sb.setFun("F(EXB;JASER_02;OGG.LIS) 1(AR;ART;) 2(;;)");
        sb.fill(this);
        this.getAllObjects();
        this.spidering("1", "F(EXB;B£SER_09;OAV.DE0) 1(AR;ART;%OBJ%) 2(;;)");

        // --- SOURCE SCHEDE INDEXING ---
        // NOTE(review): this sets the function on this bean rather than on sb,
        // unlike every other call in this class - confirm sb.fill(this) picks it up.
        this.setFun("F(QRY;JASER_02;OGG.LIS) 1(MB;SCP_SCH;) 2(;;)");
        sb.fill(this);
        this.getAllObjects();
        this.spideringSource("1", "F(EDT;*EDTLET;) 1(MB;SCP_SCH;%OBJ%) 2(OJ;*LIB;SMEDEV3)");
    }

    /**
     * Command-line entry point; runs {@link #init(String[])} when exactly four
     * arguments are supplied and prints a usage line otherwise.
     */
    public static void main(String argv[]) throws Exception {
        if (argv.length == 4) {
            SpiderLucene s = new SpiderLucene("1");
            s.init(argv);
        } else {
            System.err.println("Usage: SpiderLucene <indirizzo> <username> <password> <environment>");
        }
    }

    /**
     * Appends to {@code target} the first value of every row element.
     * For each {@link Element} in {@code righe} the "Fld" attribute is split
     * with {@code UtilityXML.getValuesRighe} and the first resulting value is kept.
     *
     * @param righe  row nodes; {@code null} is treated as empty
     * @param target vector receiving the extracted values
     */
    private static void addFirstValues(List<?> righe, Vector<String> target) {
        if (righe == null) {
            return;
        }
        for (Object item : righe) {
            if (!(item instanceof Element)) {
                continue;
            }
            String fld = ((Element) item).attribute("Fld").getValue();
            Vector<?> values = UtilityXML.getValuesRighe(fld);
            if (!values.isEmpty()) {
                target.add((String) values.get(0));
            }
        }
    }

    /** Appends the first value of every row to {@code s.schede}. */
    private static void getAllSchede(List<?> righe, SpiderLucene s) {
        addFirstValues(righe, s.schede);
    }

    /**
     * Refills {@link #objects} with the first value of every row of the current
     * result, following "*NEXT" popup entries until none is left.
     */
    private void getAllObjects() {
        // Empty the objects vector.
        this.objects.clear();
        String next;
        do {
            List<?> righe = this.getXmlRighe();
            List<?> uipop = this.getUIPopup();
            next = getNext(uipop);
            addFirstValues(righe, this.objects);
            if (next.length() > 0) {
                sb.setFun(next);
                sb.fill(this);
            }
        } while (next.length() > 0);
    }

    /** Appends the first value of every row to {@code s.articoli}. */
    private static void getAllArticles(List<?> righe, SpiderLucene s) {
        addFirstValues(righe, s.articoli);
    }

    /**
     * Looks for the popup entry whose "Codice" attribute is "*NEXT".
     *
     * @param uipop popup nodes; may be {@code null}
     * @return the "Exec" attribute of the "*NEXT" entry, or {@code ""} when there is
     *         none (previously the Exec of an unrelated entry could be returned,
     *         which made the caller follow a function that is not a continuation)
     */
    private static String getNext(List<?> uipop) {
        if (uipop == null) {
            return "";
        }
        for (Object item : uipop) {
            if (item instanceof Element) {
                Element entry = (Element) item;
                if ("*NEXT".equalsIgnoreCase(entry.attributeValue("Codice"))) {
                    String exec = entry.attributeValue("Exec");
                    return exec == null ? "" : exec;
                }
            }
        }
        return "";
    }

    /**
     * @param idlingua currently unused
     */
    public SpiderLucene(String idlingua) throws Exception {
    }

    /**
     * Implementation of the indexes for the articles: writes one Lucene document
     * per entry of {@link #objects} into the "Articoli" index.
     *
     * <p>For each object code, {@code %OBJ%} in {@code fun} is replaced by the code,
     * the function is run through {@code sb.fill(this)}, and every row returned by
     * {@code getXmlRighe()} becomes a stored, tokenized field named after the row's
     * first value. The field content is the third value for the "Codice Articolo"
     * row and the fourth value for every other row.
     *
     * @param idlingua currently unused
     * @param fun      function template containing the {@code %OBJ%} placeholder
     * @throws Exception on any I/O, service or Lucene failure
     */
    public void spidering(String idlingua, String fun) throws Exception {
        log = luceneIndexPathArt + "_log.txt";
        f = new File(log);
        if (!f.exists()) {
            f.createNewFile();
        }
        dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(log)));
        try {
            System.out.println("Indexing in " + luceneIndexPathArt);
            dos.writeBytes("Indexing in " + luceneIndexPathArt);
            index = new IndexWriter(new File(luceneIndexPathArt), new StandardAnalyzer(ITALIAN_STOP_WORDS), !incremental);
            long elapsed;
            try {
                long start = System.currentTimeMillis();
                for (String actArt : this.objects) {
                    // Literal replace: codes may contain '$' or '\', which replaceAll
                    // would interpret as group references / escapes.
                    sb.setFun(fun.replace("%OBJ%", actArt));
                    sb.fill(this);

                    // Fresh map per article so fields of a previous article never
                    // leak into the current document.
                    HashMap<String, String> articolo = new HashMap<String, String>();
                    List<?> righe = this.getXmlRighe();
                    if (righe != null) {
                        for (Object item : righe) {
                            if (item instanceof Element) {
                                String fld = ((Element) item).attribute("Fld").getValue();
                                Vector<?> values = UtilityXML.getValuesRighe(fld);
                                String label = (String) values.get(0);
                                if (label.equalsIgnoreCase("Codice Articolo")) {
                                    articolo.put(label, (String) values.get(2));
                                } else {
                                    articolo.put(label, (String) values.get(3));
                                }
                            }
                        }
                    }

                    Document docLucene = new Document();
                    for (String key : articolo.keySet()) {
                        docLucene.add(new Field(key.trim(), articolo.get(key).trim(), Field.Store.YES, Field.Index.TOKENIZED));
                    }
                    index.addDocument(docLucene);
                }
                elapsed = System.currentTimeMillis() - start;
                // save the index
                index.optimize();
            } finally {
                index.close();
            }
            dos.writeBytes("INDICIZZAZIONE ARTICOLI in " + (elapsed / 1000) + " secondi");
            System.out.println("INDICIZZAZIONE ARTICOLI in " + (elapsed / 1000) + " secondi");
        } finally {
            dos.close();
        }
    }

    /**
     * Writes one Lucene document per entry of {@link #objects} into the "Schede"
     * index, with the fields "Nome Scheda", "Oggetto Scheda" and "Source".
     *
     * <p>For each object code, {@code %OBJ%} in {@code fun} is replaced by the code
     * and the function is run through {@code sb.fill(this)}; the text of the
     * {@code Contenuto} element and the {@code Titolo2} attribute of the
     * {@code Service} element are then read from {@code getDoc()}.
     *
     * @param idlingua currently unused
     * @param fun      function template containing the {@code %OBJ%} placeholder
     * @throws Exception on any I/O, service or Lucene failure
     */
    public void spideringSource(String idlingua, String fun) throws Exception {
        log = luceneIndexPathSource + "_log.txt";
        f = new File(log);
        if (!f.exists()) {
            f.createNewFile();
        }
        dos = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(log)));
        try {
            System.out.println("Indexing in " + luceneIndexPathSource);
            dos.writeBytes("Indexing in " + luceneIndexPathSource);
            index = new IndexWriter(new File(luceneIndexPathSource), new StandardAnalyzer(ITALIAN_STOP_WORDS), !incremental);
            long elapsed;
            try {
                long start = System.currentTimeMillis();
                for (String actArt : this.objects) {
                    // Literal replace: see spidering().
                    sb.setFun(fun.replace("%OBJ%", actArt));
                    sb.fill(this);

                    org.dom4j.Document docu = this.getDoc();
                    String rootPath = "//" + docu.getRootElement().getName();
                    String contEl = ((Element) docu.selectSingleNode(rootPath + "/Contenuto")).getTextTrim();
                    // NOTE(review): in the reviewed copy the search pattern was a raw line
                    // break inside the string literal (not valid Java). It is written here
                    // as the "\n" escape, which makes this call a no-op; the pattern was
                    // presumably an escaped line-break token - confirm against version control.
                    contEl = contEl.replaceAll("\n", "\n");
                    String nameSch = ((Element) docu.selectSingleNode(rootPath + "/Service")).attributeValue("Titolo2");

                    Document docLucene = new Document();
                    docLucene.add(new Field("Nome Scheda", nameSch, Field.Store.YES, Field.Index.TOKENIZED));
                    docLucene.add(new Field("Oggetto Scheda", actArt, Field.Store.YES, Field.Index.TOKENIZED));
                    docLucene.add(new Field("Source", contEl, Field.Store.YES, Field.Index.TOKENIZED));
                    index.addDocument(docLucene);
                }
                elapsed = System.currentTimeMillis() - start;
                // save the index
                index.optimize();
            } finally {
                index.close();
            }
            dos.writeBytes("INDICIZZAZIONE SOURCE in " + (elapsed / 1000) + " secondi");
            System.out.println("INDICIZZAZIONE SOURCE in " + (elapsed / 1000) + " secondi");
        } finally {
            dos.close();
        }
    }

    /**
     * Extracts the plain text of a PDF or Word file.
     *
     * <p>An earlier attempt based on {@code LucenePDFDocument.getDocument(...)} was
     * abandoned because its "contents" field was always null.
     *
     * @param filename   path of the file to read; it is opened whatever the extension
     * @param estensione file extension, compared case-insensitively with "pdf" and "doc"
     * @return the extracted text, or {@code ""} for any other extension or for a
     *         Word file rejected by {@link #verifyWordFile(String)}
     * @throws Exception if the file cannot be opened or parsed
     */
    public String GetFileText(String filename, String estensione) throws Exception {
        String text = "";
        FileInputStream fis = new FileInputStream(filename);
        try {
            if (estensione.equalsIgnoreCase("pdf")) {
                PDFParser parser = new PDFParser(fis);
                parser.parse();
                PDDocument pdDoc = parser.getPDDocument();
                StringWriter stringWriter = new StringWriter();
                try {
                    PDFTextStripper stripper = new PDFTextStripper();
                    stripper.setLineSeparator("\n");
                    stripper.writeText(pdDoc, stringWriter);
                    text = stringWriter.toString();
                } finally {
                    stringWriter.close();
                    pdDoc.close();
                }
            } else if (estensione.equalsIgnoreCase("doc")) {
                if (this.verifyWordFile(filename)) {
                    WordExtractor we = new WordExtractor(fis);
                    text = we.getText();
                }
            }
        } finally {
            // Always release the file handle, including for unsupported extensions
            // and when parsing fails.
            fis.close();
        }
        return text;
    }

    /**
     * Used to verify the version of a Word file (or whether it is corrupt):
     * {@link WordExtractor} does not work on Word95 files.
     * Error shown in that case: "Table Stream '0Table' wasn't found - Either the
     * document is corrupt, or is Word95 (or earlier)".
     *
     * @param filename path of the Word file
     * @return {@code true} only when the file information block read from the
     *         "WordDocument" stream reports {@code isFWhichTblStm()}
     * @throws Exception if the file cannot be read as an OLE2 Word document
     */
    public boolean verifyWordFile(String filename) throws Exception {
        FileInputStream in = new FileInputStream(filename);
        try {
            POIFSFileSystem filesystem = new POIFSFileSystem(in);
            DocumentEntry documentProps = (DocumentEntry) filesystem.getRoot().getEntry("WordDocument");
            byte[] mainStream = new byte[documentProps.getSize()];
            InputStream wordStream = filesystem.createDocumentInputStream("WordDocument");
            try {
                wordStream.read(mainStream);
            } finally {
                wordStream.close();
            }
            // use the fib to determine the name of the table stream.
            FileInformationBlock fib = new FileInformationBlock(mainStream);
            return fib.isFWhichTblStm();
        } finally {
            in.close();
        }
    }

    // The operations below are intentionally empty.
    // NOTE(review): presumably required by BaseBean - confirm.

    public void clear() {
    }

    public void load() {
    }

    public void edt() {
    }

    public void del() {
    }

    public void cpy() {
    }

    public void ord() {
    }

    public void mov() {
    }

    public void add() {
    }
}