View Javadoc

1   /*
2   Copyright 2010 James Pether Sörling Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
3   	$Id
4   */
5   
6   package com.hack23.cia.service.impl.agent.sweden;
7   
8   import gnu.trove.THashMap;
9   
10  import java.util.List;
11  import java.util.Map;
12  
13  import org.apache.commons.logging.Log;
14  import org.apache.commons.logging.LogFactory;
15  
16  import com.gargoylesoftware.htmlunit.WebClient;
17  import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
18  import com.gargoylesoftware.htmlunit.html.HtmlPage;
19  import com.gargoylesoftware.htmlunit.html.HtmlTable;
20  import com.gargoylesoftware.htmlunit.html.HtmlTableCell;
21  import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
22  import com.hack23.cia.model.sweden.impl.ParliamentMember;
23  
24  /***
25   * The Class ParliamentMemberAgentImpl.
26   */
27  public class ParliamentMemberAgentImpl implements ParliamentMemberAgent {
28  
29      /*** The Constant ANCHOR. */
30      private static final String ANCHOR = "a"; //$NON-NLS-1$
31  
32      /*** The Constant LOGGER. */
33      private static final Log LOGGER = LogFactory
34              .getLog(ParliamentMemberAgentImpl.class);
35  
36      /*** The Constant PARLIAMENT_MEMBER_ENGLISH_WIKI_LIST. */
37      private static final String PARLIAMENT_MEMBER_ENGLISH_WIKI_LIST = "http://en.wikipedia.org/wiki/List_of_members_of_the_parliament_of_Sweden,_2010%E2%80%932014"; //$NON-NLS-1$
38  
39      /*** The Constant PARLIAMENT_MEMBER_LIST. */
40      private static final String PARLIAMENT_MEMBER_LIST = "http://www.riksdagen.se/webbnav/index.aspx?fnamn=&enamn=&f_ar=&kn=&party=&electoralRegion=&rdlstatus=&org=&sort=&s=1&nid=1102"; //$NON-NLS-1$
41  
42      /*** The Constant PARLIAMENT_MEMBER_SWEDISH_WIKI_LIST. */
43      private static final String PARLIAMENT_MEMBER_SWEDISH_WIKI_LIST = "http://sv.wikipedia.org/wiki/Lista_%C3%B6ver_ledam%C3%B6ter_av_Sveriges_riksdag_2010%E2%80%932014"; //$NON-NLS-1$
44  
45      /*** The english wiki href map. */
46      private final Map<String, String> englishWikiHrefMap = new THashMap<String, String>();
47  
48      /*** The href map. */
49      private final Map<String, String> hrefMap = new THashMap<String, String>();
50  
51      /*** The web client. */
52      private final WebClient webClient;
53  
54      /*** The wiki href map. */
55      private final Map<String, String> wikiHrefMap = new THashMap<String, String>();
56  
57      /***
58       * Instantiates a new parliament member agent impl.
59       *
60       * @param webClient the web client
61       */
62      public ParliamentMemberAgentImpl(final WebClient webClient) {
63          super();
64          this.webClient = webClient;
65      }
66  
67      /***
68       * Extract name.
69       *
70       * @param anchor the anchor
71       * @return the string
72       */
73      private String extractName(final HtmlAnchor anchor) {
74          String name = anchor.asText().replace(".", "");
75  
76          final String[] split = name.trim().split(" "); //$NON-NLS-1$
77  
78          if (split.length == 2) {
79              name = split[1] + ", " + split[0];
80          } else {
81              name = split[1] + " " + split[2] + ", " + split[0];
82          }
83          return name;
84      }
85  
86      /*
87       * (non-Javadoc)
88       * 
89       * @see
90       * com.hack23.cia.service.agent.sweden.ParliamentMemberAgent#getEnglishWikiHref
91       * (com.hack23.cia.model.sweden.ParliamentMember)
92       */
93      @Override
94      public final String getEnglishWikiHref(
95              final ParliamentMember parliamentMember) {
96          return englishWikiHrefMap.get(parliamentMember.getName());
97      }
98  
99      /*
100      * (non-Javadoc)
101      * 
102      * @see
103      * com.hack23.cia.service.agent.sweden.ParliamentMemberAgent#getHref(com
104      * .hack23.cia.model.sweden.ParliamentMember)
105      */
106     @Override
107     public final String getHref(final ParliamentMember parliamentMember) {
108         return hrefMap.get(parliamentMember.getName());
109     }
110 
111     /*
112      * (non-Javadoc)
113      * 
114      * @see
115      * com.hack23.cia.service.agent.sweden.ParliamentMemberAgent#getWikiHref
116      * (com.hack23.cia.model.sweden.ParliamentMember)
117      */
118     @Override
119     public final String getWikiHref(final ParliamentMember parliamentMember) {
120         return wikiHrefMap.get(parliamentMember.getName());
121     }
122 
123     /*
124      * (non-Javadoc)
125      * 
126      * @see com.hack23.cia.service.agent.sweden.ParliamentMemberAgent#initData()
127      */
128     @Override
129 	public final void initData() {
130         try {
131             final HtmlPage htmlPage = (HtmlPage) webClient
132                     .getPage(PARLIAMENT_MEMBER_LIST);
133             final List<HtmlAnchor> anchors = htmlPage.getDocumentElement()
134                     .getHtmlElementsByTagName(ANCHOR);
135 
136             for (final HtmlAnchor anchor : anchors) {
137                 hrefMap.put(anchor.asText(), anchor.getHrefAttribute());
138                 LOGGER.info("homepage:" + anchor.asText() + " - "
139                         + anchor.getHrefAttribute());
140             }
141         } catch (final Exception e) {
142             LOGGER.warn("Problem Loading Parliament web site info", e); //$NON-NLS-1$
143         }
144 
145         try {
146             final HtmlPage htmlPage = (HtmlPage) webClient
147                     .getPage(PARLIAMENT_MEMBER_SWEDISH_WIKI_LIST);
148             final List<HtmlTable> tables = htmlPage.getDocumentElement().getElementsByAttribute("table", "id",
149                             "sortable_table_id_0");
150             
151             if (tables != null) {
152             	
153             	if (tables.iterator().hasNext()) {
154             	
155 		            final HtmlTable table = tables.iterator().next();
156 		
157 		            final List<HtmlTableRow> rows = table.getRows();
158 		
159 		            for (final HtmlTableRow row : rows) {
160 		                if (row.getCells().size() > 1) {
161 		                    final HtmlTableCell cell = row.getCell(1);
162 		
163 		                    final List<HtmlAnchor> anchors = cell
164 		                            .getHtmlElementsByTagName(ANCHOR);
165 		
166 		                    HtmlAnchor anchor = null;
167 		                    if (anchors.size() > 0) {
168 		                        if (!cell.asText().contains("ersatt av")) {
169 		                            anchor = anchors.get(0);
170 		                        } else {
171 		                            anchor = anchors.get(anchors.size() - 1);
172 		                        }
173 		
174 		                        final String name = extractName(anchor);
175 		
176 		                        final String href = "http://sv.wikipedia.org"
177 		                                + anchor.getHrefAttribute();
178 		
179 		                        LOGGER.info("wiki sv: " + name + " - " + href);
180 		                        wikiHrefMap.put(name, href);
181 		
182 		                    }
183 		                }
184 		            }
185             	}
186             } else {
187             	LOGGER.warn("Problem finding wiki links on page : " + PARLIAMENT_MEMBER_SWEDISH_WIKI_LIST );
188             }
189 
190         } catch (final Exception e) {
191             LOGGER.warn("Problem Loading Parliament Swedish wiki site info", e); //$NON-NLS-1$
192         }
193 
194         try {
195             final HtmlPage htmlPage = (HtmlPage) webClient
196                     .getPage(PARLIAMENT_MEMBER_ENGLISH_WIKI_LIST);
197             final List<HtmlTable> tables = htmlPage.getDocumentElement()
198                     .getElementsByAttribute("table", "class", "wikitable");
199 
200             final HtmlTable table = tables.get(1);
201 
202             final List<HtmlTableRow> rows = table.getRows();
203 
204             for (final HtmlTableRow row : rows) {
205                 if (row.getCells().size() > 2) {
206                     final HtmlTableCell cell = row.getCell(2);
207 
208                     final List<HtmlAnchor> anchors = cell
209                             .getHtmlElementsByTagName(ANCHOR);
210 
211                     HtmlAnchor anchor = null;
212                     if (anchors.size() > 0) {
213                         if (!(cell.asText().contains("substituted") || cell
214                                 .asText().contains("replaced"))) {
215                             anchor = anchors.get(0);
216                         } else {
217                             anchor = anchors.get(anchors.size() - 1);
218                         }
219 
220                         final String name = extractName(anchor);
221 
222                         final String href = "http://en.wikipedia.org"
223                                 + anchor.getHrefAttribute();
224 
225                         LOGGER.info("wiki en: " + name + " - " + href);
226                         englishWikiHrefMap.put(name, href);
227                     }
228                 }
229             }
230 
231         } catch (final Exception e) {
232             LOGGER.warn("Problem Loading Parliament English wiki site info", e); //$NON-NLS-1$
233         }
234     }
235 }