001 /*
002 * Copyright 2007 Marc Wick, geonames.org
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 * http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 */
017 package org.geonames.wikipedia;
018
/**
 * Extracts short plain-text summaries from wikipedia article markup.
 * <p>
 * All methods are static and keep no state between calls.
 *
 * @author marc
 */
public class TextSummaryExtractor {

    /** Opening of an audio template; matched case-insensitively. */
    private static final String AUDIO_TEMPLATE_START = "{{audio";

    /** Parentheses that enclose no word character at all, e.g. "( , )". */
    private static final java.util.regex.Pattern EMPTY_PARENTHESES =
            java.util.regex.Pattern.compile("\\([^\\w]*\\)");

    /** Commas and blanks directly after an opening parenthesis. */
    private static final java.util.regex.Pattern SEPARATORS_AFTER_OPENING =
            java.util.regex.Pattern.compile("\\([, ]*");

    /** Commas and blanks directly before a closing parenthesis. */
    private static final java.util.regex.Pattern SEPARATORS_BEFORE_CLOSING =
            java.util.regex.Pattern.compile("[, ]*\\)");

    /**
     * Extracts a text summary from a wikipedia article. The wikipedia markup
     * is removed. The extractor tries to end the summary at a full stop and
     * stops at a new paragraph (the first "==" heading marker).
     *
     * @param pText
     *            the wikipedia markup, may be null
     * @param length
     *            the position the summary is cut at (the " (...)" marker is
     *            appended after the cut); 0 disables the limit on the amount
     *            of text that is scanned
     * @return the summary, or null if pText is null
     */
    public static String extractSummary(String pText, int length) {
        return extractSummary(pText, length, true);
    }

    /**
     * Extracts a text summary from a wikipedia article. The wikipedia markup
     * is removed, whitespace is normalized and the text is cut near
     * <code>length</code>, preferably at a full stop, otherwise at a blank.
     * If the text was shortened, " (...)" is appended.
     *
     * @param pText
     *            the wikipedia markup, may be null
     * @param length
     *            the position the summary is cut at; 0 disables the limit on
     *            the amount of text that is scanned
     * @param stopAtParagraph
     *            if true the summary ends at the first "==" marker, provided
     *            the marker is found after the first 10 characters
     * @return the summary, or null if pText is null
     */
    public static String extractSummary(String pText, int length,
            boolean stopAtParagraph) {
        if (pText == null) {
            return null;
        }

        String textString = removeIndentAtBeginning(stripMarkup(pText, length));

        // remove empty parentheses, then separators next to parentheses
        textString = EMPTY_PARENTHESES.matcher(textString).replaceAll("");
        textString = SEPARATORS_AFTER_OPENING.matcher(textString).replaceAll("(");
        textString = SEPARATORS_BEFORE_CLOSING.matcher(textString).replaceAll(")");

        textString = removeWhiteSpace(
                textString.replace('\r', ' ').replace('\n', ' ')
                        .replace('\t', ' ')).trim();
        textString = removeBold(textString);
        textString = removeItalic(textString);

        // convert non-breaking spaces (html entity and U+00A0) into blanks.
        // This happens after removeWhiteSpace on purpose, so that the
        // resulting blanks are preserved and not collapsed.
        textString = textString.replace("&nbsp;", " ").replace('\u00A0', ' ');
        textString = textString.replace("( ", "(").replace(" )", ")");

        int endOfTextIdx = textString.length();
        if (stopAtParagraph) {
            // only look at first paragraph for summary
            int paragraph = textString.indexOf("==");
            if (paragraph > 10) {
                endOfTextIdx = paragraph;
            }
        }

        // A paragraph cut before position 20 is considered too short. The
        // check must not fire for a complete text that is merely short,
        // otherwise a text that fits would be cut at its last blank.
        boolean paragraphTooShort = endOfTextIdx < 20
                && endOfTextIdx < textString.length();
        if (paragraphTooShort || endOfTextIdx > length) {
            // find full stop near length of text, fall back to a blank if
            // the full stop would throw away more than 30% of the length
            endOfTextIdx = textString.lastIndexOf('.', length);
            if (endOfTextIdx < 0.7 * length) {
                endOfTextIdx = textString.lastIndexOf(' ', length);
            }
        }

        // add ellipsis if we have shortened the article
        if (endOfTextIdx > -1 && endOfTextIdx < textString.length()) {
            textString = textString.substring(0, endOfTextIdx) + " (...)";
        }

        return textString.trim();
    }

    /**
     * Copies pText without templates, html elements, html comments and link
     * markup. Line breaks are kept. Scanning stops once
     * <code>50 + 2 * length</code> characters have been collected, unless
     * length is 0.
     */
    private static String stripMarkup(String pText, int length) {
        final int textLength = pText.length();
        // long arithmetic: 2 * length overflows int for very large lengths
        final long maxCollected = 50L + 2L * length;
        StringBuilder summary = new StringBuilder();
        int idx = 0;

        while (idx > -1 && idx < textLength
                && (length == 0 || summary.length() < maxCollected)) {
            char c = pText.charAt(idx);

            if (c == '{') {
                int endidx = skipTemplate(pText, idx);
                // the last parameter of an audio template is its label,
                // which is kept as text
                if (pText.regionMatches(true, idx, AUDIO_TEMPLATE_START, 0,
                        AUDIO_TEMPLATE_START.length())) {
                    // only accept a pipe that lies inside the template
                    int begLabelIdx = pText.lastIndexOf('|', endidx - 1);
                    // position of the closing "}}"
                    int endLabelIdx = endidx - 2;
                    if (begLabelIdx > idx && begLabelIdx < endLabelIdx) {
                        summary.append(pText.substring(begLabelIdx + 1,
                                endLabelIdx).trim());
                    }
                }
                idx = endidx;
                continue;
            }

            if (c == '<') {
                boolean isComment = idx + 1 < textLength
                        && pText.charAt(idx + 1) == '!';
                idx = isComment ? skipHTMLComment(pText, idx)
                        : skipHTMLElement(pText, idx);
                continue;
            }

            if (c == '[') {
                if (idx + 1 < textLength && pText.charAt(idx + 1) == '[') {
                    // two square brackets "[[" (link)
                    int endOfLink = pText.indexOf("]]", idx);

                    // a colon inside the link marks e.g. an image link. Its
                    // caption might contain further links, the whole link
                    // is dropped.
                    int colon = pText.indexOf(':', idx);
                    if (colon > -1 && colon < endOfLink) {
                        idx = findEndOfLink(pText, idx);
                        continue;
                    }

                    // keep the anchor text if there is one, otherwise the
                    // link target
                    int beginAnchor = pText.indexOf('|', idx);
                    if (beginAnchor > -1 && beginAnchor < endOfLink) {
                        idx = beginAnchor + 1;
                    } else {
                        idx = idx + 2;
                    }
                    continue;
                }
                // single square bracket: a reference link to be removed
                int endOfReference = pText.indexOf(']', idx);
                if (endOfReference > -1) {
                    idx = endOfReference + 1;
                    continue;
                }
                // no closing bracket: the '[' is kept as plain text
            } else if (c == ']' && idx + 1 < textLength
                    && pText.charAt(idx + 1) == ']') {
                // closing brackets of a link whose text has been kept
                idx = idx + 2;
                continue;
            }

            summary.append(c);
            idx++;
        }
        return summary.toString();
    }

    /**
     * Skips templates in wikipedia markup. Templates are enclosed within
     * braces {}. There might be nested templates within an other template.
     *
     * @param pText
     *            the wikipedia text with templates
     * @param pIdx
     *            pos in text to start with, MUST be a {
     * @return the idx into the text right after the template, or the length
     *         of the text if the template does not properly end. pIdx is
     *         returned unchanged if it does not point to a {.
     */
    static int skipTemplate(String pText, int pIdx) {
        return skipNested(pText, pIdx, '{', '}');
    }

    /**
     * Skips an html element in wikipedia markup, nested angle brackets are
     * balanced.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be a &lt;
     * @return the idx into the text right after the matching &gt;, or the
     *         length of the text if there is none. pIdx is returned
     *         unchanged if it does not point to a &lt;.
     */
    static int skipHTMLElement(String pText, int pIdx) {
        return skipNested(pText, pIdx, '<', '>');
    }

    /**
     * Skips a section enclosed by balanced opening and closing characters.
     * An iterative implementation is used instead of recursion.
     */
    private static int skipNested(String pText, int pIdx, char open,
            char close) {
        if (pText.charAt(pIdx) != open) {
            return pIdx;
        }

        // counter for the sections we have opened
        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == open) {
                numOpenings++;
            } else if (c == close) {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips an html comment.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be the &lt; of "&lt;!"
     * @return the idx into the text right after the first "--&gt;" found at
     *         or after pIdx, or the length of the text if the comment is
     *         not terminated. pIdx is returned unchanged if it does not
     *         point to "&lt;!".
     */
    static int skipHTMLComment(String pText, int pIdx) {
        if (pIdx + 1 >= pText.length() || pText.charAt(pIdx) != '<'
                || pText.charAt(pIdx + 1) != '!') {
            return pIdx;
        }

        int endOfComment = pText.indexOf("-->", pIdx);
        if (endOfComment == -1) {
            return pText.length();
        }
        return endOfComment + 3;
    }

    /**
     * Removes an indented first line (a line starting with a colon). If the
     * text has no line feed, only a leading italic section (:''...'') is
     * removed.
     */
    private static String removeIndentAtBeginning(String pText) {
        pText = pText.trim();
        if (!pText.startsWith(":")) {
            return pText;
        }

        int lineFeed = pText.indexOf("\n");
        if (lineFeed > -1) {
            return pText.substring(lineFeed + 1);
        }

        // we may already have removed the linefeed, check for italics
        if (pText.startsWith(":''")) {
            int italic = pText.indexOf("''", 3);
            if (italic > -1) {
                pText = pText.substring(italic + 2);
            }
        }
        return pText;
    }

    /**
     * Finds the closing "]]" of the link starting at pIdx. For every "[["
     * found before the current closing candidate, the next "]]" becomes the
     * candidate.
     *
     * @return the idx of the closing "]]"; the last "]]" seen if the link
     *         is not properly closed; pIdx if there is no "]]" at all
     */
    private static int findEndOfLink(String pText, int pIdx) {
        int end = pText.indexOf("]]", pIdx);
        if (end == -1) {
            return pIdx;
        }

        int idx = pIdx;
        int openingIdx = pText.indexOf("[[", idx + 2);
        while (openingIdx > -1 && openingIdx < end) {
            idx = end;
            end = pText.indexOf("]]", end + 2);
            openingIdx = pText.indexOf("[[", idx);
        }
        if (end != -1) {
            idx = end;
        }
        return idx;
    }

    /**
     * Removes sequences of blanks and keeps only one blank. Only the blank
     * character ' ' is treated as whitespace.
     *
     * @param pString
     *            the text to normalize, must not be null
     * @return the text with every run of blanks collapsed to a single blank
     */
    public static String removeWhiteSpace(String pString) {
        StringBuilder buf = new StringBuilder(pString.length());
        boolean previousWasBlank = false;
        for (int i = 0; i < pString.length(); i++) {
            char c = pString.charAt(i);
            boolean isBlank = c == ' ';
            if (!isBlank || !previousWasBlank) {
                buf.append(c);
            }
            previousWasBlank = isBlank;
        }
        return buf.toString();
    }

    /**
     * Removes the wikipedia bold markup (three apostrophes).
     *
     * @param pString
     *            the text, must not be null
     * @return the text without bold markup
     */
    public static String removeBold(String pString) {
        return pString.replace("'''", "");
    }

    /**
     * Removes the wikipedia italic markup (two apostrophes).
     *
     * @param pString
     *            the text, must not be null
     * @return the text without italic markup
     */
    public static String removeItalic(String pString) {
        return pString.replace("''", "");
    }

}