Skip to content

Commit b8410ff

Browse files
Cazenrxin
authored and committed
[SPARK-12537][SQL] Add option to accept quoting of all characters using the backslash quoting mechanism
We can provide an option to choose whether the JSON parser accepts quoting of all characters using the backslash quoting mechanism. Author: Cazen <Cazen@korea.com> Author: Cazen Lee <cazen.lee@samsung.com> Author: Cazen Lee <Cazen@korea.com> Author: cazen.lee <cazen.lee@samsung.com> Closes apache#10497 from Cazen/master.
1 parent 7b92922 commit b8410ff

File tree

4 files changed

+30
-2
lines changed

4 files changed

+30
-2
lines changed

python/pyspark/sql/readwriter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ def json(self, path, schema=None):
160160
quotes
161161
* ``allowNumericLeadingZeros`` (default ``false``): allows leading zeros in numbers \
162162
(e.g. 00012)
163+
* ``allowBackslashEscapingAnyCharacter`` (default ``false``): allows accepting quoting \
164+
of all characters using the backslash quoting mechanism
163165
164166
>>> df1 = sqlContext.read.json('python/test_support/sql/people.json')
165167
>>> df1.dtypes

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,8 @@ class DataFrameReader private[sql](sqlContext: SQLContext) extends Logging {
258258
* </li>
259259
* <li>`allowNumericLeadingZeros` (default `false`): allows leading zeros in numbers
260260
* (e.g. 00012)</li>
261+
* <li>`allowBackslashEscapingAnyCharacter` (default `false`): allows accepting quoting of all
262+
* characters using the backslash quoting mechanism</li>
261263
*
262264
* @since 1.6.0
263265
*/

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JSONOptions.scala

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ case class JSONOptions(
3131
allowUnquotedFieldNames: Boolean = false,
3232
allowSingleQuotes: Boolean = true,
3333
allowNumericLeadingZeros: Boolean = false,
34-
allowNonNumericNumbers: Boolean = false) {
34+
allowNonNumericNumbers: Boolean = false,
35+
allowBackslashEscapingAnyCharacter: Boolean = false) {
3536

3637
/** Sets config options on a Jackson [[JsonFactory]]. */
3738
def setJacksonOptions(factory: JsonFactory): Unit = {
@@ -40,6 +41,8 @@ case class JSONOptions(
4041
factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, allowSingleQuotes)
4142
factory.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, allowNumericLeadingZeros)
4243
factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers)
44+
factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
45+
allowBackslashEscapingAnyCharacter)
4346
}
4447
}
4548

@@ -59,6 +62,8 @@ object JSONOptions {
5962
allowNumericLeadingZeros =
6063
parameters.get("allowNumericLeadingZeros").map(_.toBoolean).getOrElse(false),
6164
allowNonNumericNumbers =
62-
parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true)
65+
parameters.get("allowNonNumericNumbers").map(_.toBoolean).getOrElse(true),
66+
allowBackslashEscapingAnyCharacter =
67+
parameters.get("allowBackslashEscapingAnyCharacter").map(_.toBoolean).getOrElse(false)
6368
)
6469
}

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonParsingOptionsSuite.scala

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,4 +111,23 @@ class JsonParsingOptionsSuite extends QueryTest with SharedSQLContext {
111111
assert(df.schema.head.name == "age")
112112
assert(df.first().getDouble(0).isNaN)
113113
}
114+
115+
test("allowBackslashEscapingAnyCharacter off") {
116+
val str = """{"name": "Cazen Lee", "price": "\$10"}"""
117+
val rdd = sqlContext.sparkContext.parallelize(Seq(str))
118+
val df = sqlContext.read.option("allowBackslashEscapingAnyCharacter", "false").json(rdd)
119+
120+
assert(df.schema.head.name == "_corrupt_record")
121+
}
122+
123+
test("allowBackslashEscapingAnyCharacter on") {
124+
val str = """{"name": "Cazen Lee", "price": "\$10"}"""
125+
val rdd = sqlContext.sparkContext.parallelize(Seq(str))
126+
val df = sqlContext.read.option("allowBackslashEscapingAnyCharacter", "true").json(rdd)
127+
128+
assert(df.schema.head.name == "name")
129+
assert(df.schema.last.name == "price")
130+
assert(df.first().getString(0) == "Cazen Lee")
131+
assert(df.first().getString(1) == "$10")
132+
}
114133
}

0 commit comments

Comments
 (0)