View Javadoc
1   /*
2    * Copyright 2010 James Pether Sörling
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *   http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   *	$Id$
17   *  $HeadURL$
18  */
19  package com.hack23.cia.service.impl.action.user.wordcount;
20  
21  import java.util.ArrayList;
22  import java.util.HashMap;
23  import java.util.Map;
24  
25  import org.slf4j.Logger;
26  import org.slf4j.LoggerFactory;
27  import org.springframework.stereotype.Service;
28  
29  import com.hack23.cia.model.external.riksdagen.documentcontent.impl.DocumentContentData;
30  
31  import weka.core.Attribute;
32  import weka.core.DenseInstance;
33  import weka.core.Instance;
34  import weka.core.Instances;
35  import weka.core.stopwords.StopwordsHandler;
36  import weka.core.tokenizers.NGramTokenizer;
37  import weka.filters.Filter;
38  import weka.filters.unsupervised.attribute.StringToWordVector;
39  
40  /**
41   * The Class WordCounterImpl.
42   */
43  @Service
44  final class WordCounterImpl implements WordCounter {
45  
46  	/** The Constant TOKEN_DELIMITERS. */
47  	private static final String TOKEN_DELIMITERS = " \r\n\t.,;:'\"()?!'";
48  
49  	/** The Constant HTML. */
50  	private static final String HTML = "html";
51  	
52  	/** The Constant LOGGER. */
53  	private static final Logger LOGGER = LoggerFactory
54  			.getLogger(WordCounterImpl.class);
55  
56  
57  	/**
58  	 * Instantiates a new word counter impl.
59  	 */
60  	public WordCounterImpl() {
61  		super();
62  	}
63  
64  	@Override
65  	public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {
66  
67  		final String html = documentContentData.getContent();
68  
69  		final Attribute input = new Attribute(HTML, (ArrayList<String>) null);
70  
71  		final ArrayList<Attribute> inputVec = new ArrayList<>();
72  		inputVec.add(input);
73  
74  		final Instances htmlInst = new Instances(HTML, inputVec, 1);
75  
76  		htmlInst.add(new DenseInstance(1));
77  		htmlInst.instance(0).setValue(0, html);
78  
79  
80  		final StopwordsHandler stopwordsHandler = new StopwordsHandler() {
81  
82  			@Override
83  			public boolean isStopword(final String word) {
84  
85  				return word.length() <5;
86  			}
87  		};
88  
89  		final NGramTokenizer tokenizer = new NGramTokenizer();
90  		tokenizer.setNGramMinSize(1);
91  		tokenizer.setNGramMaxSize(1);
92  		tokenizer.setDelimiters(TOKEN_DELIMITERS);
93  
94  		final StringToWordVector filter = new StringToWordVector();
95  		filter.setTokenizer(tokenizer);
96  		filter.setStopwordsHandler(stopwordsHandler);
97  		filter.setLowerCaseTokens(true);
98  		filter.setOutputWordCounts(true);
99  		filter.setWordsToKeep(maxResult);
100 
101 		final Map<String,Integer> result = new HashMap<>();
102 
103 		try {
104 			filter.setInputFormat(htmlInst);
105 			final Instances dataFiltered = Filter.useFilter(htmlInst, filter);
106 
107 			final Instance last = dataFiltered.lastInstance();
108 
109 			final int numAttributes = last.numAttributes();
110 
111 			for (int i = 0; i < numAttributes; i++) {
112 				result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
113 			}
114 		} catch (final Exception e) {
115 			LOGGER.warn("Problem calculating wordcount for : {} , exception:{}",documentContentData.getId() ,e);
116 		}
117 
118 
119 		return result;
120 	}
121 
122 
123 }