用MapReduce来构建索引 - ITeye

MapReduce来构建索引ITeye

2019-01-11 11:28:53 | 作者: 从寒 | 标签: 索引,运用,技能 | 浏览: 2842

import org.apache.commons.io.output.NullWriter;   import org.apache.hadoop.conf.Configuration;   import org.apache.hadoop.fs.FileSystem;   import org.apache.hadoop.fs.Path;   import org.apache.hadoop.io.IntWritable;   import org.apache.hadoop.io.LongWritable;   import org.apache.hadoop.io.NullWritable;   import org.apache.hadoop.io.Text;   import org.apache.hadoop.mapreduce.Job;   import org.apache.hadoop.mapreduce.Mapper;   import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;   import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;   import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;   import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;   import org.apache.lucene.analysis.Analyzer;   import org.apache.lucene.document.Document;   import org.apache.lucene.document.Field.Store;   import org.apache.lucene.document.StringField;   import org.apache.lucene.document.TextField;   import org.apache.lucene.index.IndexWriter;   import org.apache.lucene.index.IndexWriterConfig;   import org.apache.lucene.util.Version;   import org.apache.solr.store.hdfs.HdfsDirectory;   import org.mortbay.log.Log;   import org.wltea.analyzer.lucene.IKAnalyzer;   import com.qin.wordcount.MyWordCount;    * 运用MapReduce构建索引   * @author qindongliang   * 大数据技能交流群: 376932160   *  查找技能一号群:  324714439   *  查找技能一号群:  206247899   * Hadoop版别2.2.0   * Lucene版别4.8.0   *   Solr版别4.8.0   * **/   public class BuildIndexMapReduce {       /**       * 获取一个IndexWriter       * @param outDir 索引的输出目录       * @return IndexWriter 获取一个IndexWriter       * */       public static IndexWriter  getIndexWriter(String outDir) throws Exception{           Analyzer  analyzer=new IKAnalyzer(true);//IK分词           IndexWriterConfig    config=new IndexWriterConfig(Version.LUCENE_48, analyzer);           Configuration conf=new Configuration();           conf.set("fs.defaultFS","hdfs://192.168.46.32:9000/");//HDFS目录           Path path=new Path("hdfs://192.168.46.32:9000/qin/"+outDir);//索引目录    
       HdfsDirectory directory=new HdfsDirectory(path, conf);           long heapSize = Runtime.getRuntime().totalMemory()/ 1024L / 1024L;//总内存           long heapMaxSize = Runtime.getRuntime().maxMemory()/ 1024L / 1024L;//运用的最大内存           config.setRAMBufferSizeMB(((heapMaxSize-heapSize)*0.7));//闲暇内存的70%作为兼并因子           IndexWriter writer=new IndexWriter(directory, config);//           return writer;                  }              /**       * 索引的东西类       *        * **/       public static class LuceneDocumentUtil{           public static Document getDoc(String filed,String value){                   Document d=new Document();                   //模仿载入schemal文件,依据solr的scheml文件来灵敏的坐一些索引,                   d.add(new TextField("content", value, Store.YES));               return d;           }                  }       /**       * @author qindongliang       *       */       private static class BuildIndexMapper extends Mapper LongWritable, Text, NullWritable, NullWritable  {                      IndexWriter iw;           List Document  documenst=new ArrayList ();                             @Override       protected void setup(Context context)throws IOException, InterruptedException {           Random rd=new Random();           int i=rd.nextInt(99999999);//此处的索引目录名能够运用UUID来使它仅有           try{           iw=getIndexWriter(i+"");//初始化IndexWriter           }catch(Exception e){               e.printStackTrace();           }                                     }                         @Override           protected void map(LongWritable key, Text value,Context context)                   throws IOException, InterruptedException {           Log.info("  记载的日志信息: "+value.toString());           String values[]=value.toString().split("\1");//此处读入被索引的文件每一行           String fieldName=values[0];           String fieldValue=values[1];           Document d=LuceneDocumentUtil.getDoc(fieldName, fieldValue);           if(d==null){               return;           }           
documenst.add(d);           if(documenst.size() 5000){//运用批处理提交               iw.addDocuments(documenst);               documenst.clear();           }                      // context.write(null, null);           }       /***       * 在Map结束时,做一些事,提交索引       *        * */           @Override           protected void cleanup(Context context)throws IOException, InterruptedException {               if(documenst.size() 0){                   iw.addDocuments(documenst);               }               if(iw!=null){               iw.close(true);//封闭至兼并完结               }                          }       }   public static void main(String[] args)throws Exception {              Configuration conf=new Configuration();              conf.set("mapreduce.job.jar", "myjob.jar");       conf.set("fs.defaultFS","hdfs://192.168.46.32:9000");       conf.set("mapreduce.framework.name", "yarn");         conf.set("yarn.resourcemanager.address", "192.168.46.32:8032");        /**Job使命**/      //Job job=new Job(conf, "testwordcount");//抛弃此API      Job job=Job.getInstance(conf, "build index ");       job.setJarByClass(BuildIndexMapReduce.class);       System.out.println("形式:  "+conf.get("yarn.resourcemanager.address"));;       // job.setCombinerClass(PCombine.class);        job.setNumReduceTasks(0);//设置为3        job.setMapperClass(BuildIndexMapper.class);        job.setInputFormatClass(TextInputFormat.class);        job.setOutputFormatClass(TextOutputFormat.class);               job.setMapOutputKeyClass(NullWritable.class);        job.setMapOutputValueClass(NullWritable.class);           String path="hdfs://192.168.46.32:9000/qin/output";           FileSystem fs=FileSystem.get(conf);           Path p=new Path(path);           if(fs.exists(p)){               fs.delete(p, true);               System.out.println("输出途径存在,已删去!");           }       FileInputFormat.setInputPaths(job, "hdfs://192.168.46.32:9000/qin/indexinput");       FileOutputFormat.setOutputPath(job,p );       
System.exit(job.waitForCompletion(true) ? 0 : 1);
版权声明
本文来源于网络,版权归原作者所有,其内容与观点不代表乐橙lc8立场。转载文章仅为传播更有价值的信息,如采编人员采编有误或者版权原因,请与我们联系,我们核实后立即修改或删除。

猜您喜欢的文章