source: nutchez-0.1/tomcat/webapps/ROOT/cluster.jsp @ 253

Last change on this file since 253 was 66, checked in by waue, 15 years ago

NutchEz - an easy way to nutch

File size: 3.8 KB
RevLine 
[66]1<%--
2  Licensed to the Apache Software Foundation (ASF) under one or more
3  contributor license agreements.  See the NOTICE file distributed with
4  this work for additional information regarding copyright ownership.
5  The ASF licenses this file to You under the Apache License, Version 2.0
6  (the "License"); you may not use this file except in compliance with
7  the License.  You may obtain a copy of the License at
8 
9  http://www.apache.org/licenses/LICENSE-2.0
10 
11  Unless required by applicable law or agreed to in writing, software
12  distributed under the License is distributed on an "AS IS" BASIS,
13  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  See the License for the specific language governing permissions and
15  limitations under the License.
16--%>
<%

// @author Dawid Weiss
//
// Clusters the current search hits and renders the top clusters, each with
// its leading labels and a few sample documents.
//
// The names `clusterer`, `details`, `summaries` and `bean` are not declared
// in this scriptlet; they must already be in scope where it runs.
//
// PERFORMANCE/USER INTERFACE NOTE:
//
// What I do here is merely a demonstration. In real life the clustering
// process should be done in a separate "processing" stream, most likely
// a separate HTML frame that the user's browser requests data from.
// We don't want the user to wait with plain snippets until the clusters
// are created.
//
// Also: clustering is resource consuming, so a cache of recent queries
// would be in order. Besides, such a cache would also be beneficial for the
// purpose of re-querying existing clusters (remember that the
// clustering extension may be a heuristic returning a DIFFERENT set of
// clusters for an identical input).
// See www.vivisimo.com for details of how this can be done using frames, or
// http://carrot.cs.put.poznan.pl for an example of a Javascript solution.

// cluster the hits; `clusters` stays null when clustering is unavailable or fails
HitsCluster [] clusters = null;
if (clusterer != null) {
  final long clusteringStart = System.currentTimeMillis();
  try {
    clusters = clusterer.clusterHits( details, Summary.toStrings(summaries) );
    final long clusteringDuration = System.currentTimeMillis() - clusteringStart;
    bean.LOG.info("Clustering took: " + clusteringDuration + " milliseconds.");
  } catch (Exception e) {
    // Best effort: the page still renders ("Unable to do clustering." below),
    // but the cause is recorded instead of being silently dropped.
    bean.LOG.info("Clustering failed: " + e);
  }
}

if (clusterer == null) {
  %>No clustering extension found.<%
} else if (clusters == null) {
  %>Unable to do clustering.<%
} else if (clusters.length == 0) {
  %>No clusters found.<%
} else {
  // display top N clusters and top Q documents inside them,
  // with at most maxLabels labels per cluster heading.
  final int N = Math.min(10, clusters.length);
  final int Q = 3;
  final int maxLabels = 2;

  for (int clusterIndex = 0 ; clusterIndex < N ; clusterIndex++) {
    HitsCluster cluster = clusters[ clusterIndex ];
    String [] clusterLabels = cluster.getDescriptionLabels();

    // probably leave it on for now
    //if (cluster.isJunkCluster()) continue;

    // output cluster label (the heading is emitted even when there are no labels).
    %><div style="margin: 0px; padding: 0px; font-weight: bold;"><%
    if (clusterLabels != null) {
      for (int k = 0; k < maxLabels && k < clusterLabels.length; k++) {
        if (k > 0) out.print(", ");
        out.print( Entities.encode(clusterLabels[k]) );
      }
    }
    %></div><%

    // now output sample documents from the inside.
    // NOTE(review): getHits() is assumed to be allowed to return null for a
    // cluster without documents of its own -- confirm against HitsCluster.
    HitDetails[] documents = cluster.getHits();
    if (documents != null && documents.length > 0) {
      %><ul style="font-size: 90%; margin-top: .5em;"><%
      for (int k = 0; k < Q && k < documents.length; k++) {
        HitDetails detail = documents[ k ];
        String title = detail.getValue("title");
        String url = detail.getValue("url");
        // A missing url must not break the page: it is used both as the
        // link target and as the fallback title.
        if (url == null) url = "";
        if (title == null || title.equals("")) title = url;
        if (title.length() > 35) title = title.substring(0,35) + "...";
        // The url is escaped as well as the title: it ends up inside an
        // HTML attribute and comes from indexed (untrusted) content.
        %>
          <li><a href="<%= Entities.encode(url) %>"><%= Entities.encode(title) %></a></li>
        <%
      }
      %></ul><%
    }

    // ignore subclusters for now, ALTHOUGH HIERARCHICAL CLUSTERING
    // METHODS DO EXIST AND ARE VERY USEFUL
    // HitsCluster [] subclusters = cluster.getSubclusters();
  }
}

%>
Note: See TracBrowser for help on using the repository browser.