《自己动手写搜索引擎》日志分析类代码解析与修正（兼容 Lucene 3.0.2）- CFANZ编程社区

搜索日志是用来分析用户搜索行为和信息需求的重要依据。一般记录如下信息：
搜索关键字
用户来源IP
本次搜索返回结果数量
搜索时间
其他需要记录的应用相关信息
2008-04-04 12:01:19.2876|DEBUG|blog|title:瑞丽女性网|222.130.192.109|8
2008-04-04 12:01:22.1626|DEBUG|blog|title:瑞丽女性网|222.130.192.109|8
2008-04-04 12:01:35.0376|DEBUG|blog|的|222.130.192.109|10
2008-04-04 12:01:44.0688|DEBUG|blog|的|222.130.192.109|10
2008-04-04 12:03:31.1938|DEBUG|blog|清明,祭奠我所有的过去...|222.130.192.109|1
2008-04-04 12:37:19.7720|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:37:39.7563|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:39:08.8657|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:42:12.6313|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:42:19.4282|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:42:46.8657|DEBUG|blog|清明|222.130.192.109|10
2008-04-04 12:48:23.8813|DEBUG|blog|可见要想吃上这里的爆肚还要赶早不赶晚啊|222.130.192.109|1
2008-04-04 15:55:57.1470|DEBUG|blog|aaa|127.0.0.1|10
2008-04-04 15:57:23.4282|DEBUG|blog|aaa|222.130.192.109|10
2008-04-04 16:06:40.1626|DEBUG|blog|汽车|222.130.192.109|10
2008-04-04 16:06:46.7563|DEBUG|blog|汽车|222.130.192.109|10
 
《自己动手写搜索引擎》日志分析源代码解析：
 
package org.apache.log; 
 
import java.io.BufferedReader; 
 
import java.io.BufferedWriter; 
 
import java.io.File; 
 
import java.io.FileInputStream; 
 
import java.io.FileOutputStream; 
 
import java.io.InputStreamReader; 
 
import java.io.OutputStreamWriter; 
 
import java.util.HashMap; 
 
import java.util.StringTokenizer; 
 
import java.util.HashSet; 
 
//这里是分析搜索日志的部分 
 
/**
 * Aggregates raw search logs into a keyword list.
 *
 * <p>Each log record is expected to look like
 * {@code 2008-04-04 16:06:40.1626|DEBUG|blog|汽车|222.130.192.109|10}, i.e.
 * {@code timestamp|level|channel|query|client IP|hit count}. For every {@code .txt} file in the
 * log directory the class writes one output line per accepted query in the form
 * {@code query%hitCount%distinctIpCount}.
 */
public class SearchLog2File {

    /** Log directory used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_LOG_PATH = "E:/Java Projects/ses/src/test/lucene/dic/log";

    /** Output file used by {@link #main(String[])} when no second argument is given. */
    private static final String DEFAULT_SEARCH_WORDS =
            "E:/Java Projects/ses/src/test/lucene/dic/searchword/searchWords1.txt";

    /**
     * Substrings that disqualify a whole log line. Most entries are fragments of garbled
     * (mis-decoded) text that frequently shows up in the logs; the rest are punctuation and
     * query-syntax markers ({@code ?}, {@code ,}, {@code =}, {@code ^}, {@code "AND "}).
     */
    private static final String[] NOISE_MARKERS = {
        "?", ",", "=", "骞", "鐭", "︾", "鐢佃", "╂", "鐜", "鶶", "^", "廸", "閸", "嬭",
        "鍟", "鏂", "籂", "濞", "鐑", "瓙", "ユ", "磿", "嬫", "傚", "鐥", "滅", "閻", "彛",
        "寮", "儤", "闁", "闈", "湇", "鍏", "潡", "庡", "笅", "鐣", "冩", "撳", "鏉", "彿",
        "搧", "鎺", "闂", "閺", "墖", "夎", "浜", "褰", "锟斤", "AND "
    };

    /**
     * Command-line entry point.
     *
     * @param args optional: {@code args[0]} is the log directory, {@code args[1]} the output
     *             file; missing values fall back to the built-in default paths
     * @throws Exception if the logs cannot be read or the output cannot be written
     */
    public static void main(String[] args) throws Exception {
        String logPath = (args != null && args.length > 0) ? args[0] : DEFAULT_LOG_PATH;
        String searchWords = (args != null && args.length > 1) ? args[1] : DEFAULT_SEARCH_WORDS;

        logFiler(logPath, searchWords);
    }

    /**
     * Analyses every {@code .txt} log file in a directory and writes the aggregated keywords.
     *
     * <p>Statistics are collected per log file and appended to the output after each file, so
     * a keyword that occurs in several files produces several output lines. The output is
     * encoded as GBK.
     *
     * @param logPath     directory containing the log files
     * @param searchWords path of the file that receives the analysis result (overwritten)
     * @throws IllegalArgumentException if {@code logPath} is not a readable directory
     * @throws Exception                if reading a log file or writing the result fails
     */
    public static void logFiler(String logPath, String searchWords) throws Exception {
        // listFiles() returns null for a missing/unreadable directory; fail with a clear
        // message before the output file is created or truncated.
        File[] fileArray = new File(logPath).listFiles();
        if (fileArray == null) {
            throw new IllegalArgumentException("Log path is not a readable directory: " + logPath);
        }

        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(searchWords), "GBK"));
        try {
            for (int i = 0; i < fileArray.length; i++) {
                File logFile = fileArray[i];
                String fileName = logFile.getName();
                if (!logFile.isFile() || !fileName.endsWith(".txt")) {
                    continue;
                }

                // query -> distinct client IPs that searched for it
                HashMap<String, HashSet<String>> word2IP = new HashMap<String, HashSet<String>>();
                // query -> hit count reported the first time the query was accepted
                HashMap<String, Integer> word2ResultNum = new HashMap<String, Integer>();

                // The first 10 characters of the file name are printed as the file's date
                // (presumably names look like yyyy-MM-dd*.txt); shorter names are printed whole.
                String fileDate = fileName.substring(0, Math.min(10, fileName.length()));
                System.out.println(fileDate);

                collectFile(logFile, word2IP, word2ResultNum);

                // Write the result to the output file; replace this loop with database
                // inserts if the statistics should go into a log table instead.
                for (java.util.Map.Entry<String, HashSet<String>> e : word2IP.entrySet()) {
                    bw.write(e.getKey() + "%" + word2ResultNum.get(e.getKey()) + "%"
                            + e.getValue().size());
                    bw.write("\n"); // real line break (the original wrote the two characters "/n")
                }
            }
        } finally {
            bw.close();
        }
    }

    /** Reads one log file line by line and feeds every line into the two statistics maps. */
    private static void collectFile(File logFile,
                                    HashMap<String, HashSet<String>> word2IP,
                                    HashMap<String, Integer> word2ResultNum) throws Exception {
        // NOTE(review): the file is decoded with the platform default charset, as before;
        // confirm the encoding the logs are actually written in.
        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(logFile)));
        try {
            String readline;
            while ((readline = br.readLine()) != null) {
                recordLine(readline, word2IP, word2ResultNum);
            }
        } finally {
            br.close();
        }
    }

    /** Parses a single log record and, if it passes all filters, updates the statistics maps. */
    private static void recordLine(String readline,
                                   HashMap<String, HashSet<String>> word2IP,
                                   HashMap<String, Integer> word2ResultNum) {
        if (containsNoise(readline)) {
            return;
        }

        // Record format: 2008-04-04 16:06:40.1626|DEBUG|blog|汽车|222.130.192.109|10
        // StringTokenizer merges consecutive delimiters, so empty fields are not reported.
        StringTokenizer st = new StringTokenizer(readline, "|");
        if (st.countTokens() < 6) {
            return; // incomplete record
        }
        st.nextToken(); // timestamp, e.g. 2008-04-04 16:06:40.1626
        st.nextToken(); // level, e.g. DEBUG
        st.nextToken(); // channel, e.g. blog

        String strCont = collapseRepeatedTerm(st.nextToken());

        // Only keywords of 2 to 20 characters are collected.
        if (strCont.length() > 20 || strCont.length() < 2) {
            return;
        }
        // Drop fielded queries such as "title:瑞丽女性网".
        if (strCont.indexOf(":") >= 0) {
            return;
        }

        String strIP = st.nextToken();

        int resultNum = 0;
        try {
            resultNum = Integer.parseInt(st.nextToken().trim());
        } catch (NumberFormatException ignored) {
            // Best effort: an unparsable hit count is treated as 0 hits.
        }

        HashSet<String> ips = word2IP.get(strCont);
        if (ips != null) {
            // Known keyword: frequency is the number of distinct IPs, so repeated searches
            // from the same IP count once.
            ips.add(strIP);
        } else if (resultNum > 0) {
            // New keywords are only recorded when the search returned hits. The hit count of
            // the first occurrence is kept; later occurrences with a different count are ignored.
            ips = new HashSet<String>();
            ips.add(strIP);
            word2IP.put(strCont, ips);
            word2ResultNum.put(strCont, resultNum);
        }
    }

    /** Returns true if the line contains any of the {@link #NOISE_MARKERS}. */
    private static boolean containsNoise(String line) {
        for (int i = 0; i < NOISE_MARKERS.length; i++) {
            if (line.indexOf(NOISE_MARKERS[i]) >= 0) {
                return true;
            }
        }
        return false;
    }

    /**
     * Collapses a query whose first two space-separated terms are identical to that single
     * term ("汽车 汽车" becomes "汽车"). Only the first two terms are compared: "汽车 美容 汽车"
     * is returned unchanged, and anything after an identical leading pair is dropped.
     */
    private static String collapseRepeatedTerm(String query) {
        StringTokenizer stQuery = new StringTokenizer(query, " ");
        if (stQuery.countTokens() >= 2) {
            String key1 = stQuery.nextToken();
            if (key1.equals(stQuery.nextToken())) {
                return key1;
            }
        }
        return query;
    }
}
从代码可以看出，这种日志所记录的内容有很大的局限性：它适用于简单的日志统计分析，但很难满足企业搜索或商务网站搜索的需求。