001    /*
002     * Copyright 2007 Marc Wick, geonames.org
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     *     http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     *
016     */
017    package org.geonames.wikipedia;
018    
/**
 * Builds short plain-text summaries from Wikipedia article markup.
 * <p>
 * The extractor walks over the wiki text once, drops templates, HTML
 * elements, HTML comments and link syntax, tidies up the remaining text and
 * finally shortens it to roughly the requested length.
 *
 * @author marc
 */
public class TextSummaryExtractor {

    /** Opening of an audio template (matched case-insensitively). */
    private static final String AUDIO_TEMPLATE_START = "{{audio";

    /** Marker appended to a summary that has been shortened. */
    private static final String ELLIPSIS = " (...)";

    /** Parentheses that contain no word character at all, e.g. "( , )". */
    private static final java.util.regex.Pattern EMPTY_PARENTHESES =
            java.util.regex.Pattern.compile("\\([^\\w]*\\)");

    /** Commas and blanks directly after an opening parenthesis. */
    private static final java.util.regex.Pattern SEPARATORS_AFTER_OPENING =
            java.util.regex.Pattern.compile("\\([, ]*");

    /** Commas and blanks directly before a closing parenthesis. */
    private static final java.util.regex.Pattern SEPARATORS_BEFORE_CLOSING =
            java.util.regex.Pattern.compile("[, ]*\\)");

    /**
     * Extracts a text summary from a wikipedia article and stops at the first
     * section heading. Equivalent to
     * {@link #extractSummary(String, int, boolean)} with
     * {@code stopAtParagraph == true}.
     *
     * @param pText
     *            the wikipedia markup, may be {@code null}
     * @param length
     *            the desired maximum length of the summary text
     * @return the summary, or {@code null} if {@code pText} is {@code null}
     */
    public static String extractSummary(String pText, int length) {
        return extractSummary(pText, length, true);
    }

    /**
     * Extracts a text summary from a wikipedia article. The wikipedia markup
     * is removed. The extractor tries to end the summary at a full stop at or
     * before {@code length}; if the last full stop lies before 70% of
     * {@code length} it cuts at the last blank instead. A shortened summary
     * ends with {@code " (...)"}, which is appended in addition to the cut
     * text. If neither a full stop nor a blank is found the text is returned
     * unshortened.
     *
     * @param pText
     *            the wikipedia markup, may be {@code null}
     * @param length
     *            the desired maximum length of the summary text; {@code 0}
     *            removes the limit on how much of the article is scanned
     * @param stopAtParagraph
     *            if {@code true} the summary ends at the first {@code "=="}
     *            found after the tenth character
     * @return the summary, or {@code null} if {@code pText} is {@code null}
     */
    public static String extractSummary(String pText, int length,
            boolean stopAtParagraph) {
        if (pText == null) {
            return null;
        }
        String plainText = cleanUp(stripMarkup(pText, length));
        return truncate(plainText, length, stopAtParagraph);
    }

    /**
     * Removes templates, html and link markup from the beginning of the
     * article. Line breaks are kept.
     *
     * @param pText
     *            the wikipedia markup
     * @param length
     *            the requested summary length, 0 for no scan limit
     * @return the text without markup
     */
    private static String stripMarkup(String pText, int length) {
        final int textLength = pText.length();
        final boolean unlimited = length == 0;
        // collect more than requested, the clean up shrinks the text again.
        // long arithmetic: 2 * length overflows an int for very large values
        final long budget = 50L + 2L * length;

        StringBuilder summary = new StringBuilder();
        int idx = 0;
        while (idx < textLength && (unlimited || summary.length() < budget)) {
            char c = pText.charAt(idx);

            if (c == '{') {
                int endIdx = skipTemplate(pText, idx);
                appendAudioLabel(pText, idx, endIdx, summary);
                idx = endIdx;
                continue;
            }
            if (c == '<') {
                boolean isComment = idx + 1 < textLength
                        && pText.charAt(idx + 1) == '!';
                idx = isComment ? skipHTMLComment(pText, idx)
                        : skipHTMLElement(pText, idx);
                continue;
            }
            if (c == '[') {
                int next = skipLinkMarkup(pText, idx);
                if (next > idx) {
                    idx = next;
                    continue;
                }
                // a lonely bracket is ordinary text and is copied below
            } else if (c == ']' && idx + 1 < textLength
                    && pText.charAt(idx + 1) == ']') {
                // closing brackets of a link whose anchor text we kept
                idx += 2;
                continue;
            }

            summary.append(c);
            idx++;
        }
        return summary.toString();
    }

    /**
     * Determines how much markup has to be skipped at an opening square
     * bracket.
     *
     * @param pText
     *            the wikipedia markup
     * @param idx
     *            position of a '['
     * @return the position to continue with, or {@code idx} itself if the
     *         bracket is not part of any link markup
     */
    private static int skipLinkMarkup(String pText, int idx) {
        boolean doubleBracket = idx + 1 < pText.length()
                && pText.charAt(idx + 1) == '[';
        if (!doubleBracket) {
            // single bracket: a reference link which is removed completely
            int endOfLink = pText.indexOf(']', idx);
            return endOfLink > -1 ? endOfLink + 1 : idx;
        }

        int endOfLink = pText.indexOf("]]", idx);

        // a colon within the link is taken as an image link; its caption
        // might contain links itself, so the whole link is skipped
        int colon = pText.indexOf(':', idx);
        if (colon > -1 && colon < endOfLink) {
            return findEndOfLink(pText, idx);
        }

        // keep the anchor text if there is one, otherwise the link target
        int beginAnchor = pText.indexOf('|', idx);
        if (beginAnchor > -1 && beginAnchor < endOfLink) {
            return beginAnchor + 1;
        }
        return idx + 2;
    }

    /**
     * Appends the last argument of an audio template to the summary. Nothing
     * is appended for other templates or if the template has no argument.
     *
     * @param pText
     *            the wikipedia markup
     * @param startIdx
     *            position of the first opening brace of the template
     * @param endIdx
     *            position right after the template, as returned by
     *            {@link #skipTemplate(String, int)}
     * @param summary
     *            the buffer to append the label to
     */
    private static void appendAudioLabel(String pText, int startIdx,
            int endIdx, StringBuilder summary) {
        if (!pText.regionMatches(true, startIdx, AUDIO_TEMPLATE_START, 0,
                AUDIO_TEMPLATE_START.length())) {
            return;
        }
        // only look at pipes inside of the template
        int pipe = pText.lastIndexOf('|', endIdx - 1);
        int labelStart = pipe + 1;
        // the label ends in front of the two closing braces
        int labelEnd = endIdx - 2;
        if (pipe > startIdx && labelStart <= labelEnd) {
            summary.append(pText.substring(labelStart, labelEnd).trim());
        }
    }

    /**
     * Tidies up the text after the markup has been removed: drops a leading
     * indented line, empty parentheses, superfluous separators and white
     * space as well as the bold and italic quotes.
     *
     * @param pText
     *            the text without templates, html and links
     * @return the cleaned text
     */
    private static String cleanUp(String pText) {
        String text = removeIndentAtBeginning(pText);
        text = EMPTY_PARENTHESES.matcher(text).replaceAll("");
        // remove comma behind/in front of parenthesis
        text = SEPARATORS_AFTER_OPENING.matcher(text).replaceAll("(");
        text = SEPARATORS_BEFORE_CLOSING.matcher(text).replaceAll(")");

        String singleLine = text.replace('\r', ' ').replace('\n', ' ')
                .replace('\t', ' ');
        text = removeWhiteSpace(singleLine).trim();
        text = removeBold(text);
        text = removeItalic(text);

        // convert 'non breaking html spaces' into blanks, but preserve them
        // (white space is not collapsed again)
        text = text.replace("&nbsp;", " ");
        text = text.replace("( ", "(");
        text = text.replace(" )", ")");
        return text;
    }

    /**
     * Shortens the cleaned text to the first paragraph and the requested
     * length.
     *
     * @param pText
     *            the cleaned text
     * @param length
     *            the desired maximum length
     * @param stopAtParagraph
     *            whether to cut at the first section heading
     * @return the trimmed summary, with an ellipsis if it was shortened
     */
    private static String truncate(String pText, int length,
            boolean stopAtParagraph) {
        final int textLength = pText.length();
        int end = textLength;

        if (stopAtParagraph) {
            // only look at first paragraph for summary
            int heading = pText.indexOf("==");
            if (heading > 10) {
                end = heading;
            }
        }

        // A first paragraph of less than 20 characters is not a useful
        // summary. A complete text that is this short is fine though and
        // must not be shortened any further.
        boolean paragraphTooShort = end < 20 && end < textLength;
        if (paragraphTooShort || end > length) {
            end = pText.lastIndexOf('.', length);
            if (end < 0.7 * length) {
                end = pText.lastIndexOf(' ', length);
            }
        }

        String result = pText;
        // add ellipsis if we have shortened the article
        if (end > -1 && end < textLength) {
            result = pText.substring(0, end) + ELLIPSIS;
        }
        return result.trim();
    }

    /**
     * Skips templates in wikipedia markup. Templates are enclosed within
     * braces {}. There might be nested templates within an other template.
     *
     * @param pText
     *            the wikipedia text with templates
     * @param pIdx
     *            pos in text to start with, MUST be a {
     * @return the idx into the text right after the template, the length of
     *         the text if the template does not properly end, or
     *         {@code pIdx} if there is no opening brace at {@code pIdx}
     */
    static int skipTemplate(String pText, int pIdx) {
        // make sure we start with opening braces
        if (pText.charAt(pIdx) != '{') {
            return pIdx;
        }

        // counter for the braces we have opened, braces might be nested
        int numOpenings = 1;
        int idx = pIdx + 1;
        // loop over the text starting from the next character till the end
        // of the template or the end of the text
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '{') {
                numOpenings++;
            } else if (c == '}') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips an html element. Angle brackets nested within the element are
     * balanced.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be a '&lt;'
     * @return the idx into the text right after the closing '&gt;', the
     *         length of the text if the element does not properly end, or
     *         {@code pIdx} if there is no '&lt;' at {@code pIdx}
     */
    static int skipHTMLElement(String pText, int pIdx) {
        if (pText.charAt(pIdx) != '<') {
            return pIdx;
        }

        int numOpenings = 1;
        int idx = pIdx + 1;
        while (numOpenings > 0 && idx < pText.length()) {
            char c = pText.charAt(idx);
            if (c == '<') {
                numOpenings++;
            } else if (c == '>') {
                numOpenings--;
            }
            idx++;
        }
        return idx;
    }

    /**
     * Skips an html comment.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            pos in text to start with, MUST be the '&lt;' of a
     *            {@code <!}
     * @return the idx into the text right after the closing {@code -->}, the
     *         length of the text if the comment is never closed, or
     *         {@code pIdx} if the text does not continue with {@code <!} at
     *         {@code pIdx}
     */
    static int skipHTMLComment(String pText, int pIdx) {
        if (!pText.startsWith("<!", pIdx)) {
            return pIdx;
        }
        int end = pText.indexOf("-->", pIdx);
        // an unterminated comment swallows the rest of the text
        return end > -1 ? end + 3 : pText.length();
    }

    /**
     * Removes a leading line that is indented with a colon. If the text has
     * no line feed a leading italic section {@code :''...''} is removed
     * instead.
     *
     * @param pText
     *            the text without markup
     * @return the trimmed text without the leading indented line
     */
    private static String removeIndentAtBeginning(String pText) {
        String text = pText.trim();
        if (!text.startsWith(":")) {
            return text;
        }

        int lineFeed = text.indexOf('\n');
        if (lineFeed > -1) {
            return text.substring(lineFeed + 1);
        }

        // we may already have removed the linefeed, check for italics
        if (text.startsWith(":''")) {
            int italic = text.indexOf("''", 3);
            if (italic > -1) {
                return text.substring(italic + 2);
            }
        }
        return text;
    }

    /**
     * Finds the closing brackets of a link that may contain other links, as
     * it is the case for image captions.
     *
     * @param pText
     *            the wikipedia text
     * @param pIdx
     *            position of the opening {@code [[}
     * @return the position of the closing {@code ]]} that is taken as the end
     *         of the link, or {@code pIdx} if the text has no {@code ]]}
     *         after {@code pIdx}
     */
    private static int findEndOfLink(String pText, int pIdx) {
        int end = pText.indexOf("]]", pIdx);
        if (end == -1) {
            return pIdx;
        }

        int idx = pIdx;
        int openingIdx = pText.indexOf("[[", idx + 2);
        // every link opened in front of the current end needs an end of its
        // own, so move on to the next closing brackets
        while (openingIdx > -1 && openingIdx < end) {
            idx = end;
            end = pText.indexOf("]]", end + 2);
            openingIdx = pText.indexOf("[[", idx);
        }
        if (end != -1) {
            idx = end;
        }
        return idx;
    }

    /**
     * Removes sequences of blanks and keeps only one blank. Other white
     * space characters are not touched.
     *
     * @param pString
     *            the text to clean
     * @return the text where every run of blanks is replaced by one blank
     */
    public static String removeWhiteSpace(String pString) {
        StringBuilder buf = new StringBuilder(pString.length());
        boolean previousWasBlank = false;
        for (int i = 0; i < pString.length(); i++) {
            char c = pString.charAt(i);
            boolean blank = c == ' ';
            if (!blank || !previousWasBlank) {
                buf.append(c);
            }
            previousWasBlank = blank;
        }
        return buf.toString();
    }

    /**
     * Removes the wikipedia markup for bold text (three single quotes).
     *
     * @param pString
     *            the text to clean
     * @return the text without bold markup
     */
    public static String removeBold(String pString) {
        return pString.replace("'''", "");
    }

    /**
     * Removes the wikipedia markup for italic text (two single quotes).
     *
     * @param pString
     *            the text to clean
     * @return the text without italic markup
     */
    public static String removeItalic(String pString) {
        return pString.replace("''", "");
    }

}