[go: nahoru, domu]

Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

json2xml sanitizes even when sanitize is set to false. #26

Open
adamgcraig opened this issue Aug 17, 2017 · 20 comments
Open

json2xml sanitizes even when sanitize is set to false. #26

adamgcraig opened this issue Aug 17, 2017 · 20 comments

Comments

@adamgcraig
Copy link

Also, xml2json crashes if it encounters an unescaped <, >, or & in the XML, but I am not sure whether you would consider that a bug.

Input:

<?xml version="1.0" encoding="UTF-8"?>
<note>
  <to>xml-js</to>
  <from>ACraig</from>
  <heading>Min Example</heading>
  <body>Here are some characters that get sanitized: " '</body>
</note>

Code:

// Reproduction script: reads minexample.xml, converts it to JSON with xml-js
// in both compact and non-compact form, converts each result back to XML with
// sanitize set to false, and reports whether the output equals the input.
var fs = require('fs');
var converter = require('xml-js');

fs.readFile('minexample.xml', 'utf8', function(err, xml) {
  if (err) throw err;
  console.log('XML:');
  console.log(xml);

  // --- compact round trip ---
  console.log('compact JSON:');
  var json1 = converter.xml2json(xml, {compact: true, spaces: 2});
  console.log(json1);
  console.log('back-converted XML from compact:');
  // Declared with var: assigning to an undeclared name creates an implicit
  // global and throws a ReferenceError in strict mode.
  var xmlback1 = converter.json2xml(json1, {compact: true, spaces: 2, sanitize: false});
  console.log(xmlback1);
  console.log('matches original:');
  // Both operands are strings, so strict equality gives the same answer
  // without relying on type coercion.
  console.log(xmlback1 === xml);
  fs.writeFile('backfromcompact.xml', xmlback1, function(err) {
    if(err) {
      return console.log(err);
    }
    console.log('Saved backfromcompact.xml.');
  });

  // --- non-compact ("verbose") round trip ---
  console.log('verbose JSON:');
  var json2 = converter.xml2json(xml, {compact: false, spaces: 2});
  console.log(json2);
  console.log('back-converted XML from verbose:');
  var xmlback2 = converter.json2xml(json2, {compact: false, spaces: 2, sanitize: false});
  console.log(xmlback2);
  console.log('matches original:');
  console.log(xmlback2 === xml);
  fs.writeFile('backfromverbose.xml', xmlback2, function(err) {
    if(err) {
      return console.log(err);
    }
    console.log('Saved backfromverbose.xml.');
  });
});

Output:

XML:

<?xml version="1.0" encoding="UTF-8"?>
<note>
  <to>xml-js</to>
  <from>ACraig</from>
  <heading>Min Example</heading>
  <body>Here are some characters that get sanitized: " '</body>
</note>

compact JSON:

{
  "_declaration": {
    "_attributes": {
      "version": "1.0",
      "encoding": "UTF-8"
    }
  },
  "note": {
    "to": {
      "_text": "xml-js"
    },
    "from": {
      "_text": "ACraig"
    },
    "heading": {
      "_text": "Min Example"
    },
    "body": {
      "_text": "Here are some characters that get sanitized: \" '"
    }
  }
}

back-converted XML from compact:

<?xml version="1.0" encoding="UTF-8"?>
<note>
  <to>xml-js</to>
  <from>ACraig</from>
  <heading>Min Example</heading>
  <body>Here are some characters that get sanitized: &quot; &#39;</body>
</note>

matches original:
false
verbose JSON:

{
  "declaration": {
    "attributes": {
      "version": "1.0",
      "encoding": "UTF-8"
    }
  },
  "elements": [
    {
      "type": "element",
      "name": "note",
      "elements": [
        {
          "type": "element",
          "name": "to",
          "elements": [
            {
              "type": "text",
              "text": "xml-js"
            }
          ]
        },
        {
          "type": "element",
          "name": "from",
          "elements": [
            {
              "type": "text",
              "text": "ACraig"
            }
          ]
        },
        {
          "type": "element",
          "name": "heading",
          "elements": [
            {
              "type": "text",
              "text": "Min Example"
            }
          ]
        },
        {
          "type": "element",
          "name": "body",
          "elements": [
            {
              "type": "text",
              "text": "Here are some characters that get sanitized: \" '"
            }
          ]
        }
      ]
    }
  ]
}

back-converted XML from verbose:

<?xml version="1.0" encoding="UTF-8"?>
<note>
  <to>xml-js</to>
  <from>ACraig</from>
  <heading>Min Example</heading>
  <body>Here are some characters that get sanitized: &quot; &#39;</body>
</note>

matches original:
false
Saved backfromcompact.xml.
Saved backfromverbose.xml.

@bidiu
Copy link
bidiu commented Aug 18, 2017

Yes, I have a similar issue, if not exactly the same. I think the js2xml() should have the sanitize option just as the xml2js() does. Otherwise, once you use xml2js(xml, { sanitize: true }) to parse xml, you will never get the exact same xml again from the parsed intermediate json.

An example

The original xml:

<title>Support and resistance &amp;</title>

If you use sanitize: true, the JavaScript object will be:

{
    type: 'text',
    text: 'Support and resistance &amp;'
}

And then, the generated xml from this JS object will be:

<title>Support and resistance &amp;amp;</title>

So, basically, I think there needs to be an option to tell the library treating anything inside 'text' as it is, never escape or parse anything inside.

@nashwaan
Copy link
Owner

I agree to both comments.
I will publish an updated version to address this issue soon.

@nashwaan
Copy link
Owner
nashwaan commented Aug 19, 2017

I have published v1.4.0 to fix this issue.

I use the answers in this StackOverflow question as the basis for the fix.

Previously, when converting from js to xml, I was escaping 5 characters: & < > " ' and replacing them with &amp; &lt; &gt; &quot; &#39; respectively. I did this for the text of an element and for its attributes. The reason I did this is that I followed the recommendation in the most popular answer in that S.O. question.

The safe way is to escape all five characters in text
The safe way is to escape all five characters in attributes

But since this approach causes issues as you demonstrated, now I am adopting the recommendation in this answer.

For tags:
< &lt;
> &gt;
& &amp;
For attributes:
" &quot;
' &apos;

But note that I am not escaping ' in attributes because I always write attributes surrounded by "" and thus it is safe to keep ' characters unescaped.

The new implementation can cause a slight breaking change:

  • Converting from xml to js will decode the 5 entity codes &amp; &lt; &gt; &quot; &#39; into & < > " '. For example, &amp; WILL be changed to & character (instead of keeping it &amp;). Also, sanitize option is deprecated.
  • Converting from js to xml will cause 3 characters & < > (instead of 5 characters) if found in node text to be transformed into &amp; &lt; &gt;. And will cause 1 character " (instead of 5 characters) if found in node attribute to be transformed into &quot;.

@nashwaan
Copy link
Owner

@adamgcraig, Regarding

xml2json crashes if it encounters an unescaped <, >, or & in the XML, but I am not sure whether you would consider that a bug.

This library assumes xml input and json input are well-formed documents without issues.

@bidiu
Copy link
bidiu commented Aug 19, 2017

@nashwaan Thanks. I will test the new version with my use case when I have time.

@bidiu
Copy link
bidiu commented Aug 24, 2017

I already tested it, right now the version 1.4.1 works great for my case ; P

@nashwaan
Copy link
Owner
nashwaan commented Oct 2, 2017

Seems this issue is resolved, I will close it.

@nashwaan nashwaan closed this as completed Oct 2, 2017
@austin-laney
Copy link

Sorry to open a closed can of worms, but I'm finding an issue where the '<' and '>' characters, when they appear in a string inside an attribute, aren't being converted back to &lt; and &gt; when going from json to xml. Any ideas? Thanks

@nashwaan
Copy link
Owner
nashwaan commented Jan 3, 2018

@austin-laney , can you provide a simple and complete example, please?

You might also want to check Custom Processing Callbacks as a quick solution to handle this issue. Look for attributeValueFn.

But if you can provide a clear example, I might consider changing the core behavior of handling special characters in attribute values.

@austin-laney
Copy link
austin-laney commented Jan 3, 2018

@nashwaan, Thank you for the swift reply.
Here's an example of the input xml:

<parser start="^\s*?&lt;name&gt;regex&lt;/name&gt;$">

The start attribute denotes where a parser will begin extracting data, and its value is a regex. When I analyze how it's performing whilst debugging, here is the value when broken at a point to see the xml-js value:

^\s*?<name>regex</name>$

It does parse the tags into their respective tokens but on the conversion to js->xml it does not change it back. Any help is appreciated!

Also, if I'm not mistaken the attributeValueFn would be supplied in the options correct? I tried that and it doesn't seem to be available.

@nashwaan
Copy link
Owner
nashwaan commented Jan 5, 2018

This library uses sax for parsing xml. It always converts predefined XML entities in attributes (like &lt; to <). I could not disable this behavior. At the moment, I don't know how to preserve original attributes content when converting from xml to json.

But, to convert json back to xml, you can use something like this:

var convert = require('xml-js');

// Callback handed to js2xml as attributeValueFn: turns the angle brackets in
// an attribute value back into their entity form.
function escapeAngleBrackets(value) {
  return value.replace(/</g, '&lt;').replace(/>/g, '&gt;');
}

// Round trip: xml -> js -> xml, then compare the result with the input.
var xml = '<parser start="^\\s*?&lt;name&gt;regex&lt;/name&gt;$"/>';
var js = convert.xml2js(xml, {compact: true});
var xml_ = convert.js2xml(js, {compact: true, attributeValueFn: escapeAngleBrackets});
var verdict = xml === xml_ ? 'ok' : 'not ok';
console.log('xml -> js -> xml was ' + verdict);

@austin-laney
Copy link

Does jsontoxml support the attributeValFn? It does not recognize that when I attempt to build.

@nashwaan
Copy link
Owner
nashwaan commented Jan 6, 2018

It should be attributeValueFn (not attributeValFn). Also, it is json2xml (not jsontoxml) and it supports attributeValueFn.
If you are still facing trouble with this, try using my previous code and gradually adjusted it to your needs.

@nmlyons
Copy link
nmlyons commented Mar 22, 2018

@nashwaan I have a use case that is very related to this issue -- I hope this is a good place to ask the question.

I am using js2xml, and trying to encode the & within xml attribute values. However, since the " character is encoded by default, and encoded before the attributeValueFn is executed, replacing & causes &quot; to be converted to &amp;quot; .

Is it possible to execute the attributeValueFn before converting " to &quot; ? Alternatively, can an option be re-added to js2xml to convert the 5 special characters within attribute values?

My current workaround is to use this attributeValueFn:

/**
 * Escapes the five XML special characters (& < > " ') in an attribute value.
 *
 * Any `&quot;` already present in the input is first turned back into a plain
 * double quote, so that its ampersand is not escaped a second time.
 *
 * @param {string} attributeValue - Attribute text to escape.
 * @returns {string} The text with & < > " ' replaced by
 *   &amp; &lt; &gt; &quot; &apos; respectively.
 */
const encodeHTML = function (attributeValue) {
    const entityFor = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&apos;',
    };
    // convert quote back before converting amp
    const withPlainQuotes = attributeValue.replace(/&quot;/g, '"');
    // One pass is enough: none of the replacement entities contain
    // < > " or ', and each & is visited exactly once.
    return withPlainQuotes.replace(/[&<>"']/g, function (ch) {
        return entityFor[ch];
    });
};

Since xml2js already converts the escape sequences back to special characters, I feel this should be a suitable use-case to close the loop.

Please let me know if any more detail would help!

Thanks

@nashwaan
Copy link
Owner
nashwaan commented Jul 3, 2018

Opened this issue because of @nmlyons comment and @eug48 has raised an issue #69.

Will work on solution to address this. Most likely, I will escape all 5 characters: & < > " ' in both text of tags AND attributes.

@zengfenfei
Copy link

Yes, I have a similar issue, if not exactly the same. I think the js2xml() should have the sanitize option just as the xml2js() does. Otherwise, once you use xml2js(xml, { sanitize: true }) to parse xml, you will never get the exact same xml again from the parsed intermediate json.

@nashwaan , as @bidiu mention, so will you add sanitize option for js2xml()

I'm generating some kind of view file written in XML:

<view wx:if="{{length > 5}}"> </view>

The greater-than operator > in the above example denotes comparison logic, so it can't be escaped to &gt;.

@fkirc
Copy link
fkirc commented Oct 12, 2020

For example, &amp; WILL be changed to & character (instead of keeping it &amp;).

I am still running into problems because of this &amp; to & conversion.
This happens when converting from XML to JSON (reading an XML-file).
Therefore, it would be nice to have an option that does not change any characters at all.
For my use case, I need the exact same characters that are between my XML-tags to arrive as JSON-strings.

@Shubu15
Copy link
Shubu15 commented Jun 30, 2021

@nashwaan I am using the xml-js library aggressively. Due to this issue, I spent more time on R&D. Can you please resolve this issue ASAP?

@dalexander-trc
Copy link

I face a similar issue, where xml2js properly reads in characters within text fields, but js2xml sanitizes them, so the input and output XML will never be identical.

Input

<Title>>400</Title>

Output

<Title>&gt;400</Title>

@sonjz
Copy link
sonjz commented Mar 28, 2022

noticed this issue loading Adobe Illustrator SVGs.
Interestingly only got stopped at the ; on the <sfw xmlns="&ns_sfw;"> line, vs earlier <svg version="1.1" id="Layer_1" xmlns:x="&ns_extend;" xmlns:i="&ns_ai;" xmlns:graph="&ns_graphs;" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" width="96.014px" height="99.997px" viewBox="0 0 96.014 99.997" enable-background="new 0 0 96.014 99.997" xml:space="preserve"> line seemed to pass...

// Reproduction script: feeds an Adobe Illustrator SVG export to xml-js.
// The document's DOCTYPE declares custom entities (ns_extend, ns_ai, ns_sfw, ...)
// that are then referenced inside attribute values such as xmlns="&ns_sfw;".
let xmlJs = require("xml-js");
// The template literal below is the exact input; its whitespace is part of
// the test case (the reported error position refers to it).
let test = `
<?xml version="1.0" encoding="utf-8"?>
<!-- Generator: Adobe Illustrator 16.2.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" [
	<!ENTITY ns_extend "http://ns.adobe.com/Extensibility/1.0/">
	<!ENTITY ns_ai "http://ns.adobe.com/AdobeIllustrator/10.0/">
	<!ENTITY ns_graphs "http://ns.adobe.com/Graphs/1.0/">
	<!ENTITY ns_vars "http://ns.adobe.com/Variables/1.0/">
	<!ENTITY ns_imrep "http://ns.adobe.com/ImageReplacement/1.0/">
	<!ENTITY ns_sfw "http://ns.adobe.com/SaveForWeb/1.0/">
	<!ENTITY ns_custom "http://ns.adobe.com/GenericCustomNamespace/1.0/">
	<!ENTITY ns_adobe_xpath "http://ns.adobe.com/XPath/1.0/">
]>
<svg version="1.1" id="Layer_1" xmlns:x="&ns_extend;" xmlns:i="&ns_ai;" xmlns:graph="&ns_graphs;"
	 xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" width="96.014px"
	 height="99.997px" viewBox="0 0 96.014 99.997" enable-background="new 0 0 96.014 99.997" xml:space="preserve">
<metadata>
	<sfw  xmlns="&ns_sfw;">
		<slices></slices>
		<sliceSourceBounds  width="86.833" height="86.833" x="276.083" y="437.084" bottomLeftOrigin="true"></sliceSourceBounds>
	</sfw>
</metadata>
<path d="M86.639,47.977l-9.519-6.694L54.427,73.577L36.495,60.054L20.938,81.149c-0.754,1.025-1.925,1.567-3.109,1.567
	c-0.801,0-1.6-0.237-2.288-0.746c-1.719-1.271-2.083-3.685-0.82-5.404l20.188-27.371l17.793,13.4l18.098-25.753l-9.374-6.592
	l27.954-12.971L86.639,47.977z"/>
<rect x="4.59" y="6.582" width="7.333" height="86.833"/>
<rect x="4.59" y="86.081" width="86.833" height="7.334"/>
</svg>
`;

// Per the report, this call throws "Invalid character entity" (raised from sax).
xmlJs.xml2js(test);

Error returned

Uncaught:
<ref *1> Error: Invalid character entity
Line: 17
Column: 22
Char: ;
    at error (/REDACTED/node_modules/sax/lib/sax.js:651:10)
    at strictFail (/REDACTED/node_modules/sax/lib/sax.js:677:7)
    at parseEntity (REDACTEDnode_modules/sax/lib/sax.js:937:7)
    at SAXParser.write (/REDACTED/node_modules/sax/lib/sax.js:1485:31)
    at Object.module.exports [as xml2js] (/REDACTED/node_modules/xml-js/lib/xml2js.js:346:12) {
  note: [Circular *1]
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

10 participants