diff --git a/src/constants/string_mappings.js b/src/constants/string_mappings.js deleted file mode 100644 index 0fbaee7d74e..00000000000 --- a/src/constants/string_mappings.js +++ /dev/null @@ -1,41 +0,0 @@ -/** -* Copyright 2012-2018, Plotly, Inc. -* All rights reserved. -* -* This source code is licensed under the MIT license found in the -* LICENSE file in the root directory of this source tree. -*/ - - -'use strict'; - -// N.B. HTML entities are listed without the leading '&' and trailing ';' -// https://www.freeformatter.com/html-entities.html - -module.exports = { - entityToUnicode: { - 'mu': 'μ', - '#956': 'μ', - - 'amp': '&', - '#28': '&', - - 'lt': '<', - '#60': '<', - - 'gt': '>', - '#62': '>', - - 'nbsp': ' ', - '#160': ' ', - - 'times': '×', - '#215': '×', - - 'plusmn': '±', - '#177': '±', - - 'deg': '°', - '#176': '°' - } -}; diff --git a/src/lib/html2unicode.js b/src/lib/html2unicode.js index 2b030fa1155..a9ec30b2a4f 100644 --- a/src/lib/html2unicode.js +++ b/src/lib/html2unicode.js @@ -10,7 +10,7 @@ 'use strict'; var toSuperScript = require('superscript-text'); -var stringMappings = require('../constants/string_mappings'); +var fixEntities = require('./svg_text_utils').convertEntities; function fixSuperScript(x) { var idx = 0; @@ -33,28 +33,6 @@ function stripTags(x) { return x.replace(/\<.*\>/g, ''); } -function fixEntities(x) { - var entityToUnicode = stringMappings.entityToUnicode; - var idx = 0; - - while((idx = x.indexOf('&', idx)) >= 0) { - var nidx = x.indexOf(';', idx); - if(nidx < idx) { - idx += 1; - continue; - } - - var entity = entityToUnicode[x.slice(idx + 1, nidx)]; - if(entity) { - x = x.slice(0, idx) + entity + x.slice(nidx + 1); - } else { - x = x.slice(0, idx) + x.slice(nidx + 1); - } - } - - return x; -} - function convertHTMLToUnicode(html) { return '' + fixEntities( diff --git a/src/lib/svg_text_utils.js b/src/lib/svg_text_utils.js index fc3f718db16..b320c560183 100644 --- a/src/lib/svg_text_utils.js +++ b/src/lib/svg_text_utils.js @@ -15,7 +15,6 @@ var d3 = require('d3'); var Lib = require('../lib'); var xmlnsNamespaces = require('../constants/xmlns_namespaces'); -var stringMappings = require('../constants/string_mappings'); var LINE_SPACING = require('../constants/alignment').LINE_SPACING; // text converter @@ -223,13 +222,6 @@ var PROTOCOLS = ['http:', 'https:', 'mailto:', '', undefined, ':']; var STRIP_TAGS = new RegExp(']*)?/?>', 'g'); -var ENTITY_TO_UNICODE = Object.keys(stringMappings.entityToUnicode).map(function(k) { - return { - regExp: new RegExp('&' + k + ';', 'g'), - sub: stringMappings.entityToUnicode[k] - }; -}); - var NEWLINES = /(\r\n?|\n)/g; var SPLIT_TAGS = /(<[^<>]*>)/; @@ -254,6 +246,14 @@ var BR_TAG = //i; * * Because we hack in other attributes with style (sub & sup), drop any trailing * semicolon in user-supplied styles so we can consistently append the tag-dependent style + * + * These are for tag attributes; Chrome anyway will convert entities in + * attribute values, but not in attribute names + * you can test this by for example: + * > p = document.createElement('p') + * > p.innerHTML = 'Hi' + * > p.innerHTML + * <- 'Hi' */ var STYLEMATCH = /(^|[\s"'])style\s*=\s*("([^"]*);?"|'([^']*);?')/i; var HREFMATCH = /(^|[\s"'])href\s*=\s*("([^"]*)"|'([^']*)')/i; @@ -265,7 +265,8 @@ var POPUPMATCH = /(^|[\s"'])popup\s*=\s*("([\w=,]*)"|'([\w=,]*)')/i; function getQuotedMatch(_str, re) { if(!_str) return null; var match = _str.match(re); - return match && (match[3] || match[4]); + var result = match && (match[3] || match[4]); + return result && convertEntities(result); } var COLORMATCH = /(^|;)\s*color:/; @@ -276,19 +277,70 @@ exports.plainText = function(_str) { return (_str || '').replace(STRIP_TAGS, ' '); }; -function replaceFromMapObject(_str, list) { - if(!_str) return ''; +/* + * N.B. HTML entities are listed without the leading '&' and trailing ';' + * https://www.freeformatter.com/html-entities.html + * + * FWIW if we wanted to support the full set, it has 2261 entries: + * https://www.w3.org/TR/html5/entities.json + * though I notice that some of these are duplicates and/or are missing ";" + * eg: "&", "&", "&", and "&" all map to "&" + * We no longer need to include numeric entities here, these are now handled + * by String.fromCodePoint/fromCharCode + * + * Anyway the only ones that are really important to allow are the HTML special + * chars <, >, and &, because these ones can trigger special processing if not + * replaced by the corresponding entity. + */ +var entityToUnicode = { + mu: 'μ', + amp: '&', + lt: '<', + gt: '>', + nbsp: ' ', + times: '×', + plusmn: '±', + deg: '°' +}; - for(var i = 0; i < list.length; i++) { - var item = list[i]; - _str = _str.replace(item.regExp, item.sub); - } +// NOTE: in general entities can contain uppercase too (so [a-zA-Z]) but all the +// ones we support use only lowercase. If we ever change that, update the regex. +var ENTITY_MATCH = /&(#\d+|#x[\da-fA-F]+|[a-z]+);/g; +function convertEntities(_str) { + return _str.replace(ENTITY_MATCH, function(fullMatch, innerMatch) { + var outChar; + if(innerMatch.charAt(0) === '#') { + // cannot use String.fromCodePoint in IE + outChar = fromCodePoint( + innerMatch.charAt(1) === 'x' ? + parseInt(innerMatch.substr(2), 16) : + parseInt(innerMatch.substr(1), 10) + ); + } + else outChar = entityToUnicode[innerMatch]; - return _str; + // as in regular HTML, if we didn't decode the entity just + // leave the raw text in place. + return outChar || fullMatch; + }); } - -function convertEntities(_str) { - return replaceFromMapObject(_str, ENTITY_TO_UNICODE); +exports.convertEntities = convertEntities; + +function fromCodePoint(code) { + // Don't allow overflow. In Chrome this turns into � but I feel like it's + // more useful to just not convert it at all. + if(code > 0x10FFFF) return; + var stringFromCodePoint = String.fromCodePoint; + if(stringFromCodePoint) return stringFromCodePoint(code); + + // IE doesn't have String.fromCodePoint + // see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/fromCodePoint + var stringFromCharCode = String.fromCharCode; + if(code <= 0xFFFF) return stringFromCharCode(code); + return stringFromCharCode( + (code >> 10) + 0xD7C0, + (code % 0x400) + 0xDC00 + ); } /* @@ -302,15 +354,14 @@ function convertEntities(_str) { * somewhat differently if it does, so just keep track of this when it happens. */ function buildSVGText(containerNode, str) { - str = convertEntities(str) - /* - * Normalize behavior between IE and others wrt newlines and whitespace:pre - * this combination makes IE barf https://github.com/plotly/plotly.js/issues/746 - * Chrome and FF display \n, \r, or \r\n as a space in this mode. - * I feel like at some point we turned these into
but currently we don't so - * I'm just going to cement what we do now in Chrome and FF - */ - .replace(NEWLINES, ' '); + /* + * Normalize behavior between IE and others wrt newlines and whitespace:pre + * this combination makes IE barf https://github.com/plotly/plotly.js/issues/746 + * Chrome and FF display \n, \r, or \r\n as a space in this mode. + * I feel like at some point we turned these into
but currently we don't so + * I'm just going to cement what we do now in Chrome and FF + */ + str = str.replace(NEWLINES, ' '); var hasLink = false; @@ -435,7 +486,7 @@ function buildSVGText(containerNode, str) { newLine(); } else if(tagStyle === undefined) { - addTextNode(currentNode, parti); + addTextNode(currentNode, convertEntities(parti)); } else { // tag - open or close diff --git a/test/jasmine/tests/svg_text_utils_test.js b/test/jasmine/tests/svg_text_utils_test.js index c950f728add..4720c27369a 100644 --- a/test/jasmine/tests/svg_text_utils_test.js +++ b/test/jasmine/tests/svg_text_utils_test.js @@ -8,6 +8,16 @@ describe('svg+text utils', function() { describe('convertToTspans', function() { + var stringFromCodePoint; + + beforeAll(function() { + stringFromCodePoint = String.fromCodePoint; + }); + + afterEach(function() { + String.fromCodePoint = stringFromCodePoint; + }); + function mockTextSVGElement(txt) { return d3.select('body') .append('svg') @@ -300,16 +310,78 @@ describe('svg+text utils', function() { '100 × 20 ± 0.5 °' ); - expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °'); + expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °'); }); it('decodes some HTML entities in text (number case)', function() { var node = mockTextSVGElement( - '100μ  < 10 > 0  ' + + '100μ & < 10 > 0  ' + '100 × 20 ± 0.5 °' ); - expect(node.text()).toEqual('100μ & < 10 > 0  100 × 20 ± 0.5 °'); + expect(node.text()).toBe('100μ & < 10 > 0  100 × 20 ± 0.5 °'); + }); + + it('decodes arbitrary decimal and hex number entities', function() { + var i = 0; + for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) { + var node = mockTextSVGElement( + '&#x' + n.toString(16) + + '; = &#' + n.toString() + + '; = &#x' + n.toString(16).toUpperCase() + ';' + ); + var char = String.fromCodePoint(n); + expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n); + i++; + } + // not really necessary to assert this, but we tested 355 characters, + // weighted toward the low end but continuing all the way to the + // end of the unicode definition + expect(i).toBe(355); + }); + + it('decodes arbitrary decimal and hex number entities (IE case)', function() { + // IE does not have String.fromCodePoint + String.fromCodePoint = undefined; + expect(String.fromCodePoint).toBeUndefined(); + + var i = 0; + for(var n = 33; n < 0x10FFFF; n = Math.round(n * 1.03)) { + var node = mockTextSVGElement( + '&#x' + n.toString(16) + + '; = &#' + n.toString() + + '; = &#x' + n.toString(16).toUpperCase() + ';' + ); + var char = stringFromCodePoint(n); + expect(node.text()).toBe(char + ' = ' + char + ' = ' + char, n); + i++; + } + // not really necessary to assert this, but we tested 355 characters, + // weighted toward the low end but continuing all the way to the + // end of the unicode definition + expect(i).toBe(355); + }); + + it('does not decode entities prematurely', function() { + var testCases = [ + '<b>not boldnot bold</b>', + '', + '<b>not bold</b>' + ]; + testCases.forEach(function(testCase) { + var node = mockTextSVGElement(testCase); + + expect(node.html()).toBe( + '<b>not bold</b>', testCase + ); + }); + + var controlNode = mockTextSVGElement('bold'); + expect(controlNode.html()).toBe( + 'bold' + ); }); it('supports superscript by itself', function() {