1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package com.hack23.cia.service.impl.action.user.wordcount;
20
21 import java.util.ArrayList;
22 import java.util.HashMap;
23 import java.util.Map;
24
25 import org.slf4j.Logger;
26 import org.slf4j.LoggerFactory;
27 import org.springframework.stereotype.Service;
28
29 import com.hack23.cia.model.external.riksdagen.documentcontent.impl.DocumentContentData;
30
31 import weka.core.Attribute;
32 import weka.core.DenseInstance;
33 import weka.core.Instance;
34 import weka.core.Instances;
35 import weka.core.stopwords.StopwordsHandler;
36 import weka.core.tokenizers.NGramTokenizer;
37 import weka.filters.Filter;
38 import weka.filters.unsupervised.attribute.StringToWordVector;
39
40
41
42
43 @Service
44 final class WordCounterImpl implements WordCounter {
45
46
47 private static final String TOKEN_DELIMITERS = " \r\n\t.,;:'\"()?!'";
48
49
50 private static final String HTML = "html";
51
52
53 private static final Logger LOGGER = LoggerFactory
54 .getLogger(WordCounterImpl.class);
55
56
57
58
59
60 public WordCounterImpl() {
61 super();
62 }
63
64 @Override
65 public Map<String, Integer> calculateWordCount(final DocumentContentData documentContentData, final int maxResult) {
66
67 final String html = documentContentData.getContent();
68
69 final Attribute input = new Attribute(HTML, (ArrayList<String>) null);
70
71 final ArrayList<Attribute> inputVec = new ArrayList<>();
72 inputVec.add(input);
73
74 final Instances htmlInst = new Instances(HTML, inputVec, 1);
75
76 htmlInst.add(new DenseInstance(1));
77 htmlInst.instance(0).setValue(0, html);
78
79
80 final StopwordsHandler stopwordsHandler = new StopwordsHandler() {
81
82 @Override
83 public boolean isStopword(final String word) {
84
85 return word.length() <5;
86 }
87 };
88
89 final NGramTokenizer tokenizer = new NGramTokenizer();
90 tokenizer.setNGramMinSize(1);
91 tokenizer.setNGramMaxSize(1);
92 tokenizer.setDelimiters(TOKEN_DELIMITERS);
93
94 final StringToWordVector filter = new StringToWordVector();
95 filter.setTokenizer(tokenizer);
96 filter.setStopwordsHandler(stopwordsHandler);
97 filter.setLowerCaseTokens(true);
98 filter.setOutputWordCounts(true);
99 filter.setWordsToKeep(maxResult);
100
101 final Map<String,Integer> result = new HashMap<>();
102
103 try {
104 filter.setInputFormat(htmlInst);
105 final Instances dataFiltered = Filter.useFilter(htmlInst, filter);
106
107 final Instance last = dataFiltered.lastInstance();
108
109 final int numAttributes = last.numAttributes();
110
111 for (int i = 0; i < numAttributes; i++) {
112 result.put(last.attribute(i).name(), Integer.valueOf(last.toString(i)));
113 }
114 } catch (final Exception e) {
115 LOGGER.warn("Problem calculating wordcount for : {} , exception:{}",documentContentData.getId() ,e);
116 }
117
118
119 return result;
120 }
121
122
123 }