View Javadoc

1   /*
2   Copyright 2010 James Pether Sörling Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
3   	$Id
4   */
5   
6   package com.hack23.cia.service.impl.agent.sweden;
7   
8   import java.text.SimpleDateFormat;
9   import java.util.ArrayList;
10  import java.util.Date;
11  import java.util.Iterator;
12  import java.util.List;
13  
14  import org.apache.commons.logging.Log;
15  import org.apache.commons.logging.LogFactory;
16  
17  import com.gargoylesoftware.htmlunit.WebClient;
18  import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
19  import com.gargoylesoftware.htmlunit.html.HtmlElement;
20  import com.gargoylesoftware.htmlunit.html.HtmlPage;
21  import com.gargoylesoftware.htmlunit.html.HtmlTable;
22  import com.gargoylesoftware.htmlunit.html.HtmlTableRow;
23  import com.hack23.cia.model.sweden.impl.CommitteeReport;
24  
25  /***
26   * The Class CommitteeReportAgentImpl.
27   */
28  public class CommitteeReportAgentImpl implements CommitteeReportAgent {
29  
30      /***
31       * The Class DocumentAnswerPage.
32       */
33      class DocumentAnswerPage {
34  
35          /*** The anchors. */
36          private List<HtmlAnchor> anchors;;
37  
38          /*** The next page link row. */
39          private HtmlTableRow nextPageLinkRow = null;
40  
41          /***
42           * Instantiates a new document answer page.
43           *
44           * @param page the page
45           */
46          public DocumentAnswerPage(final HtmlPage page) {
47              try {
48                  final HtmlElement answerDiv = page.getHtmlElementById(ANSWER);
49                  final Iterator<HtmlElement> iterator = answerDiv
50                          .getHtmlElementsByTagName(TABLE).iterator();
51                  if (iterator.hasNext()) {
52                      final HtmlTable table = (HtmlTable) iterator.next();
53  
54                      anchors = table.getHtmlElementsByTagName(ANCHOR);
55  
56                      final List<HtmlTableRow> rows = new ArrayList<HtmlTableRow>(table
57                              .getRows());
58                      rows.remove(0);
59                      nextPageLinkRow = rows.remove(0);
60                      rows.remove(rows.size() - 1);
61                      return;
62                  } else {
63                      LOGGER
64                              .warn("Problem with page : " + page.getPage().getTitleText() + "\n\n" + page.asXml()); //$NON-NLS-1$
65                  }
66              } catch (final Exception e) {
67                  LOGGER.warn(e);
68              }
69          }
70  
71          /***
72           * Gets the anchors.
73           *
74           * @return the anchors
75           */
76          public List<HtmlAnchor> getAnchors() {
77              return anchors;
78          }
79  
80          /***
81           * Gets the next page.
82           *
83           * @return the next page
84           * @throws Exception the exception
85           */
86          public DocumentAnswerPage getNextPage() throws Exception {
87              if (nextPageLinkRow != null) {
88                  final List<HtmlAnchor> anchors = nextPageLinkRow
89                          .getHtmlElementsByTagName(ANCHOR);
90  
91                  for (final HtmlAnchor anchor : anchors) {
92                      if (NEXT.equals(anchor.asText())
93                              || NEXT_VERSION2.equals(anchor.asText())) {
94                          return new DocumentAnswerPage((HtmlPage) anchor.click());
95                      }
96                  }
97              }
98              return null;
99          }
100     }
101 
102     /*** The Constant ANCHOR. */
103     private static final String ANCHOR = "a"; //$NON-NLS-1$
104 
105     /*** The Constant ANSWER. */
106     private static final String ANSWER = "svar"; //$NON-NLS-1$
107 
108     /*** The Constant CENTER_PADDING. */
109     private static final String CENTER_PADDING = "centerPadding"; //$NON-NLS-1$
110 
111     /*** The Constant CLASS. */
112     private static final String CLASS = "class"; //$NON-NLS-1$
113 
114     /*** The Constant COMMITEE_REPORTS_CONTAIN. */
115     private static final String COMMITEE_REPORTS_CONTAIN = "http://www.riksdagen.se/webbnav/?nid=3120&doktyp=betankande&bet"; //$NON-NLS-1$
116 
117     /*** The Constant COMMITEE_REPORTS_PERIOD_2007_08. */
118     private static final String COMMITEE_REPORTS_PERIOD_2010_11 = "http://www.riksdagen.se/webbnav/index.aspx?nid=3110&titel=&rm=2010%2F11&bet=&doktyp=bet%C3%A4nkande&org=&s=S%C3%B6k#t%22"; //$NON-NLS-1$
119 
120     /*** The Constant DECISION. */
121     private static final String DECISION = "Beslut:"; //$NON-NLS-1$
122 
123     /*** The Constant DIV. */
124     private static final String DIV = "div"; //$NON-NLS-1$
125 
126     /*** The Constant LOGGER. */
127     private static final Log LOGGER = LogFactory
128             .getLog(CommitteeReportAgentImpl.class);
129 
130     /*** The Constant NEXT. */
131     private static final String NEXT = "nästa sida >"; //$NON-NLS-1$
132 
133     /*** The Constant NEXT_VERSION2. */
134     private static final String NEXT_VERSION2 = "nästa >"; //$NON-NLS-1$
135 
136     /*** The Constant NORMAL. */
137     private static final String NORMAL = "normal"; //$NON-NLS-1$
138 
139     /*** The Constant PARLIAMENT_DECISION. */
140     private static final String PARLIAMENT_DECISION = "Riksdagens beslut"; //$NON-NLS-1$
141 
142     /*** The Constant SPAN. */
143     private static final String SPAN = "span"; //$NON-NLS-1$
144 
145     /*** The Constant TABLE. */
146     private static final String TABLE = "table"; //$NON-NLS-1$
147 
148     /*** The format. */
149     private final SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd"); //$NON-NLS-1$
150 
151     /*** The web client. */
152     private final WebClient webClient;
153 
154     /***
155      * Instantiates a new committee report agent impl.
156      *
157      * @param webClient the web client
158      */
159     public CommitteeReportAgentImpl(final WebClient webClient) {
160         super();
161         this.webClient = webClient;
162         this.webClient.setJavaScriptEnabled(false);
163     }
164 
165 
166     /*
167      * (non-Javadoc)
168      * 
169      * @see
170      * com.hack23.cia.service.agent.sweden.CommiteeReportAgent#getCurrentList()
171      */
172     @Override
173 	public final List<CommitteeReport> getCurrentList() throws Exception {
174         final List<CommitteeReport> resultat = new ArrayList<CommitteeReport>();
175 
176         DocumentAnswerPage answerPage = new DocumentAnswerPage(
177                 (HtmlPage) webClient.getPage(COMMITEE_REPORTS_PERIOD_2010_11));
178 
179         while (answerPage != null) {
180             for (final HtmlAnchor anchor : answerPage.getAnchors()) {
181 
182                 if (anchor.getHrefAttribute()
183                         .contains(COMMITEE_REPORTS_CONTAIN)) {
184 
185                     final CommitteeReport commiteeReport = new CommitteeReport();
186                     commiteeReport.setName(anchor.asText());
187                     commiteeReport.setHref(anchor.getHrefAttribute());
188                     resultat.add(commiteeReport);
189                 }
190             }
191             if (answerPage != null) {
192                 answerPage = answerPage.getNextPage();
193             }
194         }
195         LOGGER.info("CommiteeReports found : " + resultat.size()); //$NON-NLS-1$
196         return resultat;
197     }
198 
199     /*
200      * (non-Javadoc)
201      * 
202      * @see
203      * com.hack23.cia.service.agent.sweden.CommiteeReportAgent#getDecidedDateIfAny
204      * (com.hack23.cia.model.sweden.CommiteeReport)
205      */
206     @Override
207 	public final Date getDecidedDateIfAny(final CommitteeReport commiteeReport)
208             throws Exception {
209         final HtmlPage page = (HtmlPage) webClient.getPage(commiteeReport.getHref());
210         final HtmlElement contentDiv = page.getDocumentElement()
211                 .getElementsByAttribute(DIV, CLASS, CENTER_PADDING).iterator()
212                 .next();
213 
214         final List<HtmlElement> contentBlocks = contentDiv.getElementsByAttribute(
215                 SPAN, CLASS, NORMAL);
216 
217         LOGGER
218                 .info("Checking if decision has been made  " + commiteeReport.getHref()); //$NON-NLS-1$
219         for (final HtmlElement element : contentBlocks) {
220             final String str = element.asText().trim();
221             if (str.startsWith(PARLIAMENT_DECISION)) {
222 
223                 final int startIndex = str.indexOf(DECISION);
224 
225                 if (startIndex >= 0) {
226                     final String dateStr = str.substring(startIndex + 8,
227                             startIndex + 18).replace("/", "-"); //$NON-NLS-1$ //$NON-NLS-2$
228 
229                     return parseDate(dateStr);
230                 } else {
231                     return null;
232                 }
233             }
234         }
235         return null;
236     }
237 
238     /***
239      * Parses the date.
240      *
241      * @param dateStr the date str
242      * @return the date
243      */
244     private Date parseDate(final String dateStr) {
245         try {
246             return format.parse(dateStr);
247         } catch (final Exception pe) {
248             LOGGER.warn("Problem parsing date ;" + dateStr, pe); //$NON-NLS-1$
249         }
250         return null;
251     }
252 }