/*
 * Copyright 2007 Marc Wick, geonames.org
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package org.geonames.wikipedia;

/**
 * Turns wikipedia article markup into a short plain text summary.
 *
 * @author marc
 *
 */
public class TextSummaryExtractor {

    /** Opening of an audio template; matched case-insensitively. */
    private static final String AUDIO_TEMPLATE = "{{audio";

    /** Opening of a formatnum template; matched case-insensitively. */
    private static final String FORMATNUM_TEMPLATE = "{{formatnum";

    /** Marker appended to a summary that has been shortened. */
    private static final String ELLIPSIS = " (...)";

    /**
     * Extracts a text summary from a wikipedia article. The wikipedia markup
     * is removed. A text that is longer than <code>length</code> is cut at or
     * before position <code>length</code> and <code>" (...)"</code> is
     * appended. The extractor tries to end the summary at a full stop. It
     * stops at a new paragraph.
     *
     * @param pText
     *            the wikipedia markup, may be null
     * @param length
     *            the position at or before which the text is cut; 0 disables
     *            the limit on how much text is collected
     * @return the summary, or null if pText is null or could not be processed
     */
    public static String extractSummary(String pText, int length) {
        return extractSummary(pText, length, true);
    }

    /**
     * Extracts a text summary from a wikipedia article, see
     * {@link #extractSummary(String, int)}.
     *
     * @param pText
     *            the wikipedia markup, may be null
     * @param length
     *            the position at or before which the text is cut; 0 disables
     *            the limit on how much text is collected
     * @param stopAtParagraph
     *            if true the summary ends at the first remaining
     *            <code>==</code> marker, provided that marker is found after
     *            the tenth character
     * @return the summary, or null if pText is null or could not be processed
     */
    public static String extractSummary(String pText, int length, boolean stopAtParagraph) {
        if (pText == null) {
            return null;
        }
        try {
            String plainText = tidy(stripMarkup(pText, length));
            return shorten(plainText, length, stopAtParagraph);
        } catch (RuntimeException e) {
            // Extraction is best effort: report the problem and hand back
            // null instead of failing the caller.
            System.err.println("could not extract summary: " + e);
            return null;
        }
    }

    /**
     * Copies the text and leaves out templates, html comments, html tags,
     * link brackets and section headers.
     *
     * @param text
     *            the wikipedia markup, not null
     * @param length
     *            the requested summary length
     * @return the text without markup; line breaks are still contained
     */
    private static String stripMarkup(String text, int length) {
        StringBuilder out = new StringBuilder();
        // More text than requested is collected because the clean up that
        // follows removes characters again. The limit is computed as a long
        // so that a very large length cannot overflow into a tiny limit.
        final long limit = 50L + 2L * length;
        final int textLength = text.length();

        int idx = 0;
        while (idx < textLength && (length == 0 || out.length() < limit)) {
            char c = text.charAt(idx);
            // '\0' stands for "no next character"; it matches none of the
            // markup characters tested below.
            char next = idx + 1 < textLength ? text.charAt(idx + 1) : '\0';

            if (c == '{') {
                // a template: keep the label of audio and formatnum
                // templates, drop everything else
                int end = skipTemplate(text, idx);
                out.append(templateLabel(text, idx, end));
                idx = end;
                continue;
            }

            if (c == '<') {
                idx = next == '!' ? skipHTMLComment(text, idx) : skipHTMLElement(text, idx);
                continue;
            }

            if (c == '[') {
                if (next == '[') {
                    // two square brackets "[[" start a link
                    int endOfLink = text.indexOf("]]", idx);

                    // A colon inside the link marks an image (or other
                    // namespace) link. Its caption may contain further
                    // links, so the whole link is skipped.
                    int colon = text.indexOf(':', idx);
                    if (colon > -1 && colon < endOfLink) {
                        idx = findEndOfLink(text, idx);
                        continue;
                    }

                    // keep the anchor text if there is one, the link target
                    // otherwise
                    int beginAnchor = text.indexOf('|', idx);
                    if (beginAnchor > -1 && beginAnchor < endOfLink) {
                        idx = beginAnchor + 1;
                    } else {
                        idx = idx + 2;
                    }
                    continue;
                }
                // a single square bracket is a reference link which is
                // removed up to its closing bracket
                int endOfLink = text.indexOf(']', idx);
                if (endOfLink > -1) {
                    idx = endOfLink + 1;
                    continue;
                }
            } else if (c == ']') {
                if (next == ']') {
                    idx = idx + 2;
                    continue;
                }
            } else if (c == '=') {
                if (next == '=') {
                    // section header: skip up to and including the closing "=="
                    int endHeaderIdx = text.indexOf("==", idx + 2);
                    if (endHeaderIdx > -1) {
                        idx = endHeaderIdx + 2;
                        continue;
                    }
                }
            }

            out.append(c);
            idx++;
        }
        return out.toString();
    }

    /**
     * Returns the text an audio or formatnum template contributes to the
     * summary.
     *
     * @param text
     *            the wikipedia markup
     * @param start
     *            index of the first opening brace of the template
     * @param end
     *            index returned by {@link #skipTemplate(String, int)} for
     *            this template
     * @return the trimmed label, or the empty string for any other template
     *         and for templates without a usable label
     */
    private static String templateLabel(String text, int start, int end) {
        // the last two characters before 'end' are taken to be the closing
        // braces
        int contentEnd = end - 2;

        if (text.regionMatches(true, start, AUDIO_TEMPLATE, 0, AUDIO_TEMPLATE.length())) {
            // the label is the last parameter of the template; the separator
            // has to be inside this template
            int pipe = text.lastIndexOf('|', end);
            if (pipe > start && pipe + 1 <= contentEnd) {
                return text.substring(pipe + 1, contentEnd).trim();
            }
            return "";
        }

        if (text.regionMatches(true, start, FORMATNUM_TEMPLATE, 0, FORMATNUM_TEMPLATE.length())) {
            // the label is the text between the colon and the next
            // parameter separator, or the end of the template
            int colon = text.indexOf(':', start);
            if (colon > -1 && colon < contentEnd) {
                int pipe = text.indexOf('|', colon);
                int labelEnd = pipe > -1 && pipe < contentEnd ? pipe : contentEnd;
                return text.substring(colon + 1, labelEnd).trim();
            }
        }
        return "";
    }

    /**
     * Cleans up the text left over after the markup has been removed: empty
     * parenthesis, separators next to parenthesis, line breaks, repeated
     * blanks, bold and italic markers and non breaking spaces.
     *
     * @param pText
     *            the text without markup
     * @return the cleaned text
     */
    private static String tidy(String pText) {
        String text = removeIndentAtBeginning(pText);
        // remove parenthesis that contain no word character
        text = text.replaceAll("\\([^\\w]*\\)", "");
        // remove commas and blanks directly inside of parenthesis
        text = text.replaceAll("\\([, ]*", "(");
        text = text.replaceAll("[, ]*\\)", ")");

        text = removeWhiteSpace(text.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')).trim();

        text = removeBold(text);
        text = removeItalic(text);

        // Convert non breaking html spaces into blanks. This happens after
        // removeWhiteSpace on purpose, so that the blanks are preserved.
        text = text.replace("&nbsp;", " ").replace('\u00A0', ' ');
        text = text.replace("( ", "(");
        text = text.replace(" )", ")");
        return text;
    }

    /**
     * Cuts the text near <code>length</code> and marks the cut with
     * <code>" (...)"</code>.
     *
     * @param pText
     *            the cleaned text
     * @param length
     *            the position at or before which the text is cut
     * @param stopAtParagraph
     *            whether to end the summary at the first <code>==</code>
     * @return the trimmed summary
     */
    private static String shorten(String pText, int length, boolean stopAtParagraph) {
        String text = pText;
        int endOfTextIdx = text.length();

        if (stopAtParagraph) {
            // only look at the first paragraph for the summary
            int paragraph = text.indexOf("==");
            if (paragraph > 10) {
                endOfTextIdx = paragraph;
            }
        }

        if (endOfTextIdx < 20 || endOfTextIdx > length) {
            if (text.length() <= length) {
                // the whole text fits, there is nothing to cut
                endOfTextIdx = text.length();
            } else {
                // prefer the last full stop before length; if that would
                // lose more than 30% of the requested length fall back to
                // the last blank
                endOfTextIdx = text.lastIndexOf(".", length);
                if (endOfTextIdx < 0.7 * length) {
                    endOfTextIdx = text.lastIndexOf(" ", length);
                }
            }
        }

        // add ellipsis if we have shortened the article
        if (endOfTextIdx > -1 && endOfTextIdx < text.length()) {
            text = text.substring(0, endOfTextIdx) + ELLIPSIS;
        }

        return text.trim();
    }

    /**
     * Skips templates in wikipedia markup. Templates are enclosed within
     * braces {}. There might be nested templates within an other template.
     *
     * @param pText
     *            the wikipedia text with templates
     * @param pIdx
     *            pos in text to start with, MUST be a {
     * @return the index just behind the brace that closes the template, the
     *         length of the text if the template does not properly end, or
     *         pIdx if there is no opening brace at pIdx
     */
    static int skipTemplate(String pText, int pIdx) {
        // make sure we start with an opening brace
        if (pText.charAt(pIdx) != '{') {
            return pIdx;
        }

        // number of braces that are currently open; nesting is handled by
        // counting instead of recursion
        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '{') {
                numOpenings++;
            } else if (c == '}') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips an html tag. Angle brackets are counted, so nested opening
     * brackets have to be closed as well.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be a &lt;
     * @return the index just behind the closing bracket, the length of the
     *         text if the tag does not properly end, or pIdx if there is no
     *         opening bracket at pIdx
     */
    static int skipHTMLElement(String pText, int pIdx) {
        if (pText.charAt(pIdx) != '<') {
            return pIdx;
        }

        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '<') {
                numOpenings++;
            } else if (c == '>') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips an html comment.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be the &lt; of {@code <!}
     * @return the index just behind the closing {@code -->}, the length of
     *         the text if the comment is not closed, or pIdx if the text
     *         does not continue with {@code <!} at pIdx
     */
    static int skipHTMLComment(String pText, int pIdx) {
        if (!pText.startsWith("<!", pIdx)) {
            return pIdx;
        }
        int close = pText.indexOf("-->", pIdx);
        // an unterminated comment swallows the rest of the text
        return close > -1 ? close + 3 : pText.length();
    }

    /**
     * Removes an indented first line (a line starting with a colon). If the
     * text consists of one line only, a leading italic note
     * (<code>:''...''</code>) is removed instead.
     *
     * @param pText
     *            the text without markup
     * @return the trimmed text without the indented beginning
     */
    private static String removeIndentAtBeginning(String pText) {
        String text = pText.trim();
        if (!text.startsWith(":")) {
            return text;
        }

        int lineFeed = text.indexOf('\n');
        if (lineFeed > -1) {
            return text.substring(lineFeed + 1);
        }

        // there is no line feed, check for italics
        if (text.startsWith(":''")) {
            int italic = text.indexOf("''", 3);
            if (italic > -1) {
                return text.substring(italic + 2);
            }
        }
        return text;
    }

    /**
     * Finds the "]]" that closes the link starting at pIdx. Links within the
     * link (as found in image captions) are taken into account.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            index of the "[[" that opens the link
     * @return the index of the closing "]]" (or of the last "]]" found if the
     *         brackets are not balanced), pIdx if there is no "]]" at all
     */
    private static int findEndOfLink(String pText, int pIdx) {
        int end = pText.indexOf("]]", pIdx);
        if (end == -1) {
            return pIdx;
        }

        int idx = pIdx;
        int openingIdx = pText.indexOf("[[", idx + 2);
        // as long as another link opens in front of the current "]]", that
        // "]]" belongs to the inner link: move on to the next one
        while (openingIdx > -1 && openingIdx < end) {
            idx = end;
            end = pText.indexOf("]]", end + 2);
            openingIdx = pText.indexOf("[[", idx);
        }
        if (end != -1) {
            idx = end;
        }
        return idx;
    }

    /**
     * Removes sequences of blanks and keeps only one blank. Other whitespace
     * characters are not touched.
     *
     * @param pString
     *            the text, not null
     * @return the text with every run of blanks reduced to a single blank
     */
    public static String removeWhiteSpace(String pString) {
        StringBuilder buf = new StringBuilder(pString.length());
        boolean previousWasBlank = false;
        for (int i = 0; i < pString.length(); i++) {
            char c = pString.charAt(i);
            boolean blank = c == ' ';
            if (!blank || !previousWasBlank) {
                buf.append(c);
            }
            previousWasBlank = blank;
        }
        return buf.toString();
    }

    /**
     * Removes the wikipedia markers for bold text (three apostrophes).
     *
     * @param pString
     *            the text, not null
     * @return the text without bold markers
     */
    public static String removeBold(String pString) {
        return pString.replace("'''", "");
    }

    /**
     * Removes the wikipedia markers for italic text (two apostrophes).
     *
     * @param pString
     *            the text, not null
     * @return the text without italic markers
     */
    public static String removeItalic(String pString) {
        return pString.replace("''", "");
    }

}