Difference between revisions of "MediaWiki:TextCleaner.js"

From radlines.org
Jump to navigation Jump to search
(Created page with "/** Wikitext sanitation for MediaWiki Author: User:Lupo, January 2008 License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0) Choose...")
 
 
(No difference)

Latest revision as of 19:02, 27 May 2018

/**
  Wikitext sanitation for MediaWiki

  Author: User:Lupo, January 2008
  License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)

  Choose whichever license of these you like best :-)
*/

// <nowiki>
/* global mw:false, TextCleaner:true */
/* eslint no-control-regex:0, one-var:0, vars-on-top:0, camelcase:0, curly:0, space-in-parens:0, computed-property-spacing:0, array-bracket-spacing:0 */
/* jshint curly:false, eqnull:true, laxbreak:true */
(function () {
  'use strict';

  // Object the module is published on: `window` in a browser. The fallback only exists so
  // that the file can also be loaded where no `window` is defined.
  var root = typeof window !== 'undefined' ? window : globalThis;

  root.TextCleaner = {

    // Lazily built list of all names of the image/file namespace (id 6), without colon.
    imgNamespaceNames: null,

    // This function attempts to construct well-formed wikitext from input that may contain
    // possibly broken wikitext.
    //
    // Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
    // of templates, and due to the fact that image thumbnail captions may themselves contain
    // links. This implementation catches the most common errors (such as forgetting to close a
    // template or a link), and even some more elaborate ones. With enough malice, this sanitation
    // can still be broken by user input such that the result is not well-formed wikitext as the
    // parser at the servers would like to have it. (It's still possible that the result is broken
    // wikitext, if the input was broken wikitext. But it never transforms well-formed wikitext
    // into broken wikitext.)
    //
    // If 'only_thumbs' is true, all [[Image: links are changed to [[:Image:, unless the original
    // image link was a thumbnail or had a width smaller than 300px specified.
    //
    // WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is
    // probably rather inefficient due to the many substrings that are generated. This function is
    // primarily intended to be used to clean up user input in forms, which are typically rather
    // short.
    //
    // Parameters:
    //   input        string  The wikitext to clean up.
    //   only_thumbs  boolean If truthy, inline images that are not small thumbnails are turned
    //                        into plain links to the image page.
    // Returns: the sanitized wikitext (the unmodified input if it contains no critical characters).
    sanitizeWikiText: function (input, only_thumbs) {
      if (input.search(/[\][}{]|<nowiki(\s[^>]*)?>|<!--/) < 0) return input; // No critical characters

      if (!TextCleaner.imgNamespaceNames) {
        TextCleaner.imgNamespaceNames = [];
        // Guarded so that a missing `mw` behaves like a missing wgNamespaceIds.
        var namespaceIds = (typeof mw !== 'undefined' && mw.config)
          ? mw.config.get('wgNamespaceIds')
          : null;
        if (namespaceIds) {
          for (var name in namespaceIds) {
            if (namespaceIds[name] === 6) // Image namespace
              TextCleaner.imgNamespaceNames.push(name);
          }
        }
        // Make sure that we have the two canonical names
        TextCleaner.imgNamespaceNames.push('Image');
        TextCleaner.imgNamespaceNames.push('File');
        // If your Wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
      }

      // consumed[level - 1] records how many input characters the most recent recursive call
      // at caption level 'level' used up; the caller reads it right after the call returns.
      var consumed = [0, 0];
      // For image captions. Image caption may contain links, and may even contain images.
      // The current MediaWiki parser actually allows this only once. For deeper recursions,
      // it fails. But here, it's actually easier to implement no limit.

      var base_regexp = new RegExp(
        '[\\x01\\x02\\x03\\x04[\\]\\|\\x05\\x06\\x07\\x08]' + '|<nowiki(\\s[^>]*)?>|<!--',
        'i'); // Ignore case
      var nowiki_regexp = new RegExp('<\\/nowiki(\\s[^>]*)?>|<!--', 'i');

      var allow_only_thumbs = only_thumbs;

      // Recursive worker. Operates on text in which the multi-character tokens have already
      // been replaced by the one-character placeholders \x01 .. \x08.
      //
      //   s               text to process
      //   with_links      whether a link may start at the top level of s
      //   caption_level   recursion depth; 0 for the outermost call
      //   allow_thumbs    whether thumbnails are allowed in s
      //   break_at_pipe   stop at the first pipe (or table end) outside any template/table
      //   with_tables     whether a table may start in s
      //   with_galleries  whether a gallery may start in s
      //
      // Returns the sanitized text; for caption_level > 0 it additionally stores the number of
      // characters of s it consumed in consumed[caption_level - 1].
      function sanitize(s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries) {
        if (!s || !s.length) {
          if (caption_level > 0) {
            if (consumed.length < caption_level)
              consumed.push(0);
            else
              consumed[caption_level - 1] = 0;
          }
          return s;
        }

        var result = '';
        var ch = '';
        var initial_length = s.length;
        var get_out = false;
        var in_nowiki = false;
        var endings = null; // Stack recording template and table nesting
        var next;
        var regexp = base_regexp;

        function push_end(val) {
          if (!endings)
            endings = [val];
          else
            endings.push(val);
        }

        function pop_end() {
          if (!endings) return null; // Shouldn't happen
          var result;
          if (endings.length === 1) {
            result = endings[0];
            endings = null; // An empty stack is always represented as null
          } else {
            result = endings[endings.length - 1];
            endings.length--;
          }
          return result;
        }

        // If s has an image namespace prefix ('Image:', 'File:', ...) at index i, return that
        // prefix exactly as written in s (including the colon); otherwise return null.
        function get_initial(i, s) {
          for (var j = 0; j < TextCleaner.imgNamespaceNames.length; j++) {
            if (s.length >= i + TextCleaner.imgNamespaceNames[j].length + 1) {
              var t = s.substr(i, TextCleaner.imgNamespaceNames[j].length + 1);
              if (t.toLowerCase() === (TextCleaner.imgNamespaceNames[j].toLowerCase() + ':'))
                return t;
            }
          }
          return null;
        }

        while (s.length > 0 && !get_out) {
          next = s.search(regexp);
          if (next < 0) {
            result += s;
            break;
          }
          ch = s.charAt(next);
          var i = -1;
          var j = -1;
          var k = -1;
          switch (ch) {
            case '<':
              // Nowiki or HTML comment. Must be closed.
              if (s.charAt(next + 1) === '!') {
                // HTML comment. Cannot be nested.
                i = s.indexOf('-->', next + 3);
                if (i < 0) {
                  result += s + '-->';
                  s = '';
                } else {
                  result += s.substring(0, i + 3);
                  s = s.substring(i + 3);
                }
              } else if (s.charAt(next + 1).toLowerCase() === 'n') {
                // Start of nowiki (the regexp ignores case, so the check here must, too).
                // Nowiki may contain HTML comments!
                in_nowiki = true;
                regexp = nowiki_regexp;
                result += s.substring(0, next + 7);
                s = s.substring(next + 7);
              } else {
                // End of nowiki. Searched for and found only if in_nowiki === true
                in_nowiki = false;
                regexp = base_regexp;
                i = s.indexOf('>', next + 1); // End of tag
                result += s.substring(0, i + 1);
                s = s.substring(i + 1);
              }
              break;
            case '\x05':
              // Table start
              if (!with_tables) {
                result += s.substring(0, next);
                get_out = true;
                break;
              }
              /* fall through */
            case '\x07':
              if (ch === '\x07' && !with_galleries) {
                result += s.substring(0, next);
                get_out = true;
                break;
              }
              /* fall through */
            case '\x01':
              // Start of template, table, or gallery
              result += s.substring(0, next + 1);
              // Each end placeholder is the character following its start placeholder.
              push_end(String.fromCharCode(ch.charCodeAt(0) + 1).charAt(0));
              s = s.substring(next + 1);
              break;
            case '\x06':
              // Table end
              if (break_at_pipe && !endings) {
                result += s.substring(0, next);
                get_out = true;
                break;
              }
              /* fall through */
            case '\x02':
              // End of a template or table
              result += s.substring(0, next);
              if (!endings || endings[endings.length - 1] !== ch) {
                // Spurious template or table end: emit it as character references so that it
                // shows up literally instead of unbalancing the markup.
                if (ch === '\x02')
                  result += '&#x7D;&#x7D;';
                else
                  result += '&#x7C;&#x7D;';
              } else {
                result += pop_end();
              }
              s = s.substring(next + 1);
              break;
            case '\x08':
              // End of gallery
              result += s.substring(0, next + 1);
              if (endings && endings[endings.length - 1] === ch) pop_end();
              s = s.substring(next + 1);
              break;
            case '\x03':
            case '[': {
              if (!with_links && !endings) {
                // Keep the text in front of the bracket: it is counted as consumed below, so
                // it would otherwise vanish from the output.
                result += s.substring(0, next);
                get_out = true;
                break;
              }
              // Image links must be treated specially, since they may contain nested links
              // in the caption!
              var initial = null; // If set, it's 'image:' or 'file:' and we have an image link
              i = next;
              while (i < s.length && s.charAt(i) === ch) i++;
              if (ch === '\x03' && i < s.length && s.charAt(i) === '[') i++;
              initial = get_initial(i, s);

              // Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
              var lk_text = sanitize(
                s.substring(i),
                false, // No links at top-level allowed
                caption_level + 1,
                false, // No thumbs
                true, // Break at pipe
                false, // No tables
                false); // No galleries
              var lk_text_length = consumed[caption_level];
              j = i + lk_text_length;

              if (j >= s.length) {
                // Used up the whole text: [[Foo or [bar
                if (initial && allow_only_thumbs) {
                  // Should in any case have started with [[, not [
                  result += s.substring(0, i - 1) + '\x03:' + initial +
                    lk_text.substring(initial.length) + '\x04';
                } else {
                  result += s.substring(0, i) + lk_text +
                    ((s.charAt(i - 1) === '[') ? ']' : '\x04');
                }
                s = '';
                break;
              }

              if (s.charAt(j) === '|')
                k = j;
              else
                k = -1;

              if (k < 0) {
                // No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar]
                if (initial && allow_only_thumbs) {
                  // Should in any case have started with [[, not [
                  result += s.substring(0, i - 1) + '\x03:' + initial +
                    lk_text.substring(initial.length) + '\x04';
                } else {
                  result += s.substring(0, i) + lk_text +
                    ((s.charAt(i - 1) === '[') ? ']' : '\x04');
                }
                if (s.charAt(j) === ']' || s.charAt(j) === '\x04') {
                  // Indeed closing the link
                  s = s.substring(j + 1);
                } else {
                  s = s.substring(j);
                }
                break;
              } else {
                var caption = null;
                var used = 0;
                // Pipe found.
                if (!initial) {
                  // Not an image link. Must be something like [[Foo|Bar]].
                  caption = sanitize(
                    s.substring(k + 1),
                    false, // No links, please
                    caption_level + 1,
                    false, // No thumbs either
                    false, // Don't care about pipes
                    true, // Allow tables (yes, parser allows that!)
                    true); // Allow galleries (?)
                  // Now we're at [[, [, ]], or ]
                  used = consumed[caption_level];
                  result += s.substring(0, i) + lk_text + '|' + caption +
                    ((s.charAt(i - 1) === '[') ? ']' : '\x04');
                } else {
                  var q = s.substring(k);
                  // We assume that there are no templates, nowikis, and other nasty things
                  // in the parameters. Search forward until the next [, {, ], }
                  var l = q.search(/[\x01\x02\x03[\x04\]{}\x05\x06\x07\x08]/);
                  if (l < 0) l = q.length;
                  if (l + 1 < q.length) q = q.substring(0, l + 1);
                  var is_thumb = q.search(/\|\s*thumb(nail)?\s*[|\x04]/) >= 0;
                  var img_width = /\|\s*(\d+)px\s*[|\x04]/.exec(q);
                  if (img_width && img_width.length > 1) {
                    img_width = parseInt(img_width[1], 10);
                    if (isNaN(img_width)) img_width = null;
                  } else {
                    img_width = null;
                  }
                  // Without an explicit width, thumbs count as small, everything else as large.
                  if (!img_width) img_width = is_thumb ? 180 : 301;
                  var is_small = img_width <= 300;
                  // Caption starts at the last pipe before l. If that is a parameter,
                  // it doesn't hurt.
                  var m = k + q.lastIndexOf('|', l);
                  caption = sanitize(
                    s.substring(m + 1),
                    is_thumb, // Allow links only if it's a thumb
                    caption_level + 1,
                    allow_thumbs && is_thumb,
                    false, // Don't break at pipe
                    is_thumb, // Tables only if it's a thumb
                    is_thumb); // Allow galleries for thumbs (?)
                  used = consumed[caption_level];
                  // caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04'
                  is_thumb = allow_thumbs && is_small;
                  if (is_thumb || !allow_only_thumbs) {
                    result += s.substring(0, i - 1) + '\x03' + lk_text;
                  } else {
                    // Turn the inline image into a link to the image page
                    result += s.substring(0, i - 1) + '\x03:' + initial +
                      lk_text.substring(initial.length);
                  }
                  result += s.substring(k, m + 1) + caption + '\x04';
                  k = m;
                }
                next = k + 1 + used;
                if (next < s.length) {
                  if (s.charAt(next) !== '\x04')
                    s = s.substring(next);
                  else
                    s = s.substring(next + 1);
                } else {
                  s = '';
                }
              }
              break;
            }
            case '\x04':
            case ']':
              // Extra bracket.
              result += s.substring(0, next);
              if (!caption_level && !break_at_pipe) {
                // Escape it so that it is shown literally.
                result += (ch === ']' ? '&#x5D;' : '&#x5D;&#x5D;');
                s = s.substring(next + 1);
              } else {
                get_out = true;
              }
              break;
            case '|':
              result += s.substring(0, next);
              if (break_at_pipe && !endings) {
                // Pipe character at top level
                get_out = true;
              } else {
                if (!caption_level && !break_at_pipe && !endings)
                  result += '&#x7C;'; // Top-level pipe character: escape it
                else
                  result += '|';
                s = s.substring(next + 1);
              }
              break;
          } // end switch
        } // end while

        if (in_nowiki) result += '<\/nowiki>'; // Make sure this nowiki is closed.

        // Close open templates and tables
        while (endings) {
          ch = pop_end();
          // A table end must be on its own line
          result += (ch === '\x06' ? '\n' : '') + ch;
        }

        if (caption_level > 0) {
          // On an early exit, s still holds the unprocessed rest, starting at index 'next'.
          var used_up = initial_length - (get_out ? (s.length - next) : 0);
          if (consumed.length < caption_level)
            consumed.push(used_up);
          else
            consumed[caption_level - 1] = used_up;
        }
        return result;
      }

      // Replace multi-character tokens by one-character placeholders, simplifying the
      // subsequent processing.
      var s = input.replace(/\{\{/g, '\x01')
        .replace(/\n\s*\|\}\}\}/g, '\n\x06\x02') // Table end + template end
        .replace(/\}\}/g, '\x02')
        .replace(/\[\[/g, '\x03')
        .replace(/\]\]/g, '\x04')
        .replace(/\n\s*\{\|/g, '\n\x05') // Table start and end must be on own line
        .replace(/^\s*\{\|/, '\x05') // Table start at the very beginning
        .replace(/\n\s*\|\}/g, '\n\x06') // (we strip leading whitespace)
        .replace(/<\s*gallery\s*>/g, '\x07')
        .replace(/<\/\s*gallery\s*>/g, '\x08');

      // with links, allow thumbs, don't break at pipe, allow tables, allow galleries
      s = sanitize(s, true, 0, true, false, true, true);

      // Turn the placeholders back into the tokens they stand for.
      return s.replace(/\x01/g, '{{')
        .replace(/\x02/g, '}}')
        .replace(/\x03/g, '[[')
        .replace(/\x04/g, ']]')
        .replace(/\x05/g, '{|')
        .replace(/\x06/g, '|}')
        .replace(/\x07/g, '<gallery>')
        .replace(/\x08/g, '<\/gallery>');
    }
  };
}());
// </nowiki>