Source code

001/*
002 * Copyright 2007 Marc Wick, geonames.org
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 *
016 */
017package org.geonames.wikipedia;
018
/**
 * Extracts short plain-text summaries from wikipedia article markup.
 * <p>
 * Templates, html elements, html comments, links and section headings are
 * stripped, the remaining text is normalised and finally shortened to the
 * requested length. All methods are static and keep no state.
 * 
 * @author marc
 * 
 */
public class TextSummaryExtractor {

    /** Prefix of the audio template whose last parameter is kept as text. */
    private static final String AUDIO_TEMPLATE = "{{audio";

    /** Prefix of the formatnum template whose number is kept as text. */
    private static final String FORMATNUM_TEMPLATE = "{{formatnum";

    /**
     * Extracts a text summary from a wikipedia article. The wikipedia markup
     * is removed. The extractor tries to end the summary at a full stop (or,
     * failing that, at a blank) at or before position <code>length</code>
     * and appends <code>" (...)"</code> when the text was shortened. The
     * summary stops at a new paragraph.
     * 
     * @param pText
     *            the wikipedia markup, may be <code>null</code>
     * @param length
     *            the position in the cleaned text up to which a cut point is
     *            searched
     * @return the summary, or <code>null</code> if <code>pText</code> is
     *         <code>null</code>
     */
    public static String extractSummary(String pText, int length) {
        return extractSummary(pText, length, true);
    }

    /**
     * Extracts a text summary from a wikipedia article, see
     * {@link #extractSummary(String, int)}.
     * <p>
     * Malformed markup (unterminated templates, comments, links or a dangling
     * bracket at the end of the text) is tolerated: the unterminated
     * construct is skipped up to the end of the text instead of raising an
     * error.
     * 
     * @param pText
     *            the wikipedia markup, may be <code>null</code>
     * @param length
     *            the position in the cleaned text up to which a cut point is
     *            searched
     * @param stopAtParagraph
     *            if <code>true</code> the summary ends at the first remaining
     *            <code>==</code> marker found after position 10
     * @return the summary, or <code>null</code> if <code>pText</code> is
     *         <code>null</code>
     */
    public static String extractSummary(String pText, int length, boolean stopAtParagraph) {
        if (pText == null) {
            return null;
        }
        String plainText = tidy(stripMarkup(pText, length));
        return shorten(plainText, length, stopAtParagraph);
    }

    /**
     * Copies the text character by character and leaves out the wikipedia
     * markup. Copying stops once more than <code>50 + 2 * length</code>
     * characters were collected (a <code>length</code> of 0 disables this
     * limit), because later stages only look at the beginning of the text.
     */
    private static String stripMarkup(String text, int length) {
        StringBuilder summary = new StringBuilder();
        int textLength = text.length();
        // computed as long, 2 * length overflows an int for large lengths
        long limit = 50L + 2L * length;
        int idx = 0;

        while (idx < textLength && (length == 0 || summary.length() < limit)) {
            char c = text.charAt(idx);

            if (c == '{') {
                // skip the template but keep the label of some known templates
                int endidx = skipTemplate(text, idx);
                appendTemplateLabel(text, idx, endidx, summary);
                idx = endidx;
                continue;
            }

            if (c == '<') {
                boolean isComment = idx + 1 < textLength && text.charAt(idx + 1) == '!';
                idx = isComment ? skipHTMLComment(text, idx) : skipHTMLElement(text, idx);
                continue;
            }

            if (c == '[') {
                int next = skipLinkMarkup(text, idx);
                if (next != idx) {
                    idx = next;
                    continue;
                }
                // no closing bracket, the character is kept as plain text
            } else if (c == ']') {
                // closing "]]" of a link whose anchor text was copied
                if (text.startsWith("]]", idx)) {
                    idx += 2;
                    continue;
                }
            } else if (c == '=') {
                // "== heading ==" : skip up to and including the closing "=="
                if (text.startsWith("==", idx)) {
                    int endHeaderIdx = text.indexOf("==", idx + 2);
                    if (endHeaderIdx > -1) {
                        idx = endHeaderIdx + 2;
                        continue;
                    }
                }
            }

            summary.append(c);
            idx++;
        }
        return summary.toString();
    }

    /**
     * Appends the visible label of an audio or formatnum template to the
     * summary. All lookups are confined to the template itself, i.e. to the
     * range between <code>start</code> and the two characters in front of
     * <code>end</code> (the closing braces).
     * 
     * @param text
     *            the wikipedia markup
     * @param start
     *            index of the opening brace of the template
     * @param end
     *            index returned by {@link #skipTemplate(String, int)}
     * @param summary
     *            buffer the label is appended to
     */
    private static void appendTemplateLabel(String text, int start, int end, StringBuilder summary) {
        int labelEnd = end - 2;

        if (text.regionMatches(true, start, AUDIO_TEMPLATE, 0, AUDIO_TEMPLATE.length())) {
            // the label is the last parameter of the template
            int pipe = text.lastIndexOf('|', labelEnd - 1);
            if (pipe > start) {
                summary.append(text.substring(pipe + 1, labelEnd).trim());
            }
        }

        if (text.regionMatches(true, start, FORMATNUM_TEMPLATE, 0, FORMATNUM_TEMPLATE.length())) {
            // the number follows the colon and ends at the next parameter
            int colon = text.indexOf(':', start);
            if (colon > -1 && colon < labelEnd) {
                int pipe = text.indexOf('|', colon);
                int stop = (pipe > -1 && pipe < labelEnd) ? pipe : labelEnd;
                summary.append(text.substring(colon + 1, stop).trim());
            }
        }
    }

    /**
     * Determines where copying continues after an opening square bracket.
     * <ul>
     * <li>"[[a:b ...]]" (a colon in front of the closing brackets, e.g. an
     * image link): continue at the closing brackets of the whole link</li>
     * <li>"[[target|anchor]]": continue at the anchor text</li>
     * <li>"[[target]]": continue behind the opening brackets</li>
     * <li>"[reference]": continue behind the closing bracket</li>
     * </ul>
     * 
     * @param text
     *            the wikipedia markup
     * @param idx
     *            index of the opening square bracket
     * @return the index to continue with, or <code>idx</code> itself if the
     *         bracket is a single one that is never closed
     */
    private static int skipLinkMarkup(String text, int idx) {
        boolean doubleBracket = idx + 1 < text.length() && text.charAt(idx + 1) == '[';
        if (!doubleBracket) {
            int close = text.indexOf(']', idx);
            return close > -1 ? close + 1 : idx;
        }

        int endOfLink = text.indexOf("]]", idx);

        int colon = text.indexOf(':', idx);
        if (colon > -1 && colon < endOfLink) {
            // the caption of an image link might contain links itself
            return findEndOfLink(text, idx);
        }

        int beginAnchor = text.indexOf('|', idx);
        if (beginAnchor > -1 && beginAnchor < endOfLink) {
            return beginAnchor + 1;
        }
        return idx + 2;
    }

    /**
     * Normalises the text left over after the markup was stripped: removes a
     * leading indented line, empty parentheses, commas and blanks next to
     * parentheses, line breaks, tabs, repeated blanks and the bold and italic
     * quote markers.
     */
    private static String tidy(String raw) {
        String text = removeIndentAtBeginning(raw);
        // remove empty parenthesis
        text = text.replaceAll("\\([^\\w]*\\)", "");
        // remove commas and blanks right inside of parenthesis
        text = text.replaceAll("\\([, ]*", "(");
        text = text.replaceAll("[, ]*\\)", ")");

        text = removeWhiteSpace(text.replace('\r', ' ').replace('\n', ' ').replace('\t', ' ')).trim();

        text = removeBold(text);
        text = removeItalic(text);

        // convert 'non breaking html spaces' into blanks. This happens after
        // removeWhiteSpace, so these blanks are not collapsed.
        text = text.replace("&nbsp;", " ");
        text = text.replace("( ", "(");
        text = text.replace(" )", ")");
        return text;
    }

    /**
     * Cuts the cleaned text and appends <code>" (...)"</code> if something
     * was cut off. The cut point is the last full stop at or before
     * <code>length</code>, or the last blank if that full stop lies within
     * the first 70% of <code>length</code>.
     */
    private static String shorten(String text, int length, boolean stopAtParagraph) {
        int endOfTextIdx = text.length();

        if (stopAtParagraph) {
            // only look at first paragraph for summary
            int paragraph = text.indexOf("==");
            if (paragraph > 10) {
                endOfTextIdx = paragraph;
            }
        }

        // NOTE(review): a text shorter than 20 characters also takes this
        // path and may be cut at its last blank; kept as is, intent unclear.
        if (endOfTextIdx < 20 || endOfTextIdx > length) {
            endOfTextIdx = text.lastIndexOf('.', length);
            if (endOfTextIdx < 0.7 * length) {
                endOfTextIdx = text.lastIndexOf(' ', length);
            }
        }

        // add ellipsis if we have shortened the article
        if (endOfTextIdx > -1 && endOfTextIdx < text.length()) {
            text = text.substring(0, endOfTextIdx) + " (...)";
        }

        return text.trim();
    }

    /**
     * Skips a template in wikipedia markup. Templates are enclosed within
     * braces {}. There might be nested templates within an other template.
     * 
     * @param pText
     *            the wikipedia text with templates
     * @param pIdx
     *            pos in text to start with, MUST be a {
     * @return the idx behind the closing brace of the template, the length of
     *         the text if the template does not properly end, or
     *         <code>pIdx</code> if there is no opening brace at
     *         <code>pIdx</code>
     */
    static int skipTemplate(String pText, int pIdx) {
        if (pText.charAt(pIdx) != '{') {
            return pIdx;
        }

        // number of braces currently open, templates might be nested
        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '{') {
                numOpenings++;
            } else if (c == '}') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips a single html tag. Only the tag itself is skipped, not the
     * content between an opening and a closing tag.
     * 
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be a '&lt;'
     * @return the idx behind the closing '&gt;' of the tag, the length of the
     *         text if the tag does not properly end, or <code>pIdx</code> if
     *         there is no '&lt;' at <code>pIdx</code>
     */
    static int skipHTMLElement(String pText, int pIdx) {
        if (pText.charAt(pIdx) != '<') {
            return pIdx;
        }

        // angle brackets are counted the same way as braces of templates
        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '<') {
                numOpenings++;
            } else if (c == '>') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips a html comment.
     * 
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be the '&lt;' of "&lt;!"
     * @return the idx behind the closing "--&gt;", the length of the text if
     *         the comment is not closed, or <code>pIdx</code> if the text
     *         does not continue with "&lt;!" at <code>pIdx</code>
     */
    static int skipHTMLComment(String pText, int pIdx) {
        if (pIdx + 1 >= pText.length() || pText.charAt(pIdx) != '<' || pText.charAt(pIdx + 1) != '!') {
            return pIdx;
        }

        int close = pText.indexOf("-->", pIdx);
        if (close == -1) {
            // unterminated comment, everything up to the end is skipped
            return pText.length();
        }
        return close + 3;
    }

    /**
     * Removes a leading line that starts with a colon. If the text has no
     * line feed, a leading italic section (":''...''") is removed instead.
     */
    private static String removeIndentAtBeginning(String pText) {
        String text = pText.trim();
        if (!text.startsWith(":")) {
            return text;
        }

        int lineFeed = text.indexOf('\n');
        if (lineFeed > -1) {
            return text.substring(lineFeed + 1);
        }

        // no line feed in the text, check for italics
        if (text.startsWith(":''")) {
            int italic = text.indexOf("''", 3);
            if (italic > -1) {
                return text.substring(italic + 2);
            }
        }
        return text;
    }

    /**
     * Finds the closing brackets of a link that may contain further links.
     * 
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos of the opening "[["
     * @return the idx of the "]]" that closes the link, or <code>pIdx</code>
     *         if there is no "]]" behind <code>pIdx</code>
     */
    private static int findEndOfLink(String pText, int pIdx) {
        int end = pText.indexOf("]]", pIdx);
        if (end == -1) {
            return pIdx;
        }

        int idx = pIdx;
        int openingIdx = pText.indexOf("[[", idx + 2);
        // every link opened in front of the current end moves the end to the
        // next closing brackets
        while (openingIdx > -1 && openingIdx < end) {
            idx = end;
            end = pText.indexOf("]]", end + 2);
            openingIdx = pText.indexOf("[[", idx);
        }
        if (end != -1) {
            idx = end;
        }
        return idx;
    }

    /**
     * Removes sequences of blanks and keeps only one blank. Only the blank
     * character ' ' is treated, other whitespace is copied unchanged.
     * 
     * @param pString
     *            the text to clean
     * @return the text without consecutive blanks
     */
    public static String removeWhiteSpace(String pString) {
        StringBuilder buf = new StringBuilder(pString.length());
        boolean previousWasBlank = false;
        for (int i = 0; i < pString.length(); i++) {
            char c = pString.charAt(i);
            boolean blank = c == ' ';
            if (!blank || !previousWasBlank) {
                buf.append(c);
            }
            previousWasBlank = blank;
        }
        return buf.toString();
    }

    /**
     * Removes the wikipedia bold markers (three single quotes).
     * 
     * @param pString
     *            the text to clean
     * @return the text without bold markers
     */
    public static String removeBold(String pString) {
        return pString.replace("'''", "");
    }

    /**
     * Removes the wikipedia italic markers (two single quotes).
     * 
     * @param pString
     *            the text to clean
     * @return the text without italic markers
     */
    public static String removeItalic(String pString) {
        return pString.replace("''", "");
    }

}