
Commit e72e7cb

Fixed validation regular expressions (GoogleCloudPlatform#3332)
* Fixed validation regular expressions: updated the Pub/Sub regular expression to match what Pub/Sub actually validates.
* Linked to resources for validation.
* Fixed lint warnings.
1 parent 5ee8237 commit e72e7cb

5 files changed: +79 -64

dataflow/flex-templates/streaming_beam_sql/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
 FROM gcr.io/dataflow-templates-base/java11-template-launcher-base:latest
 
 # Define the Java command options required by Dataflow Flex Templates.
-ENV FLEX_TEMPLATE_JAVA_MAIN_CLASS="org.apache.beam.samples.StreamingBeamSQL"
+ENV FLEX_TEMPLATE_JAVA_MAIN_CLASS="org.apache.beam.samples.StreamingBeamSql"
 ENV FLEX_TEMPLATE_JAVA_CLASSPATH="/template/pipeline.jar"
 
 # Make sure to package as an uber-jar including all dependencies.

dataflow/flex-templates/streaming_beam_sql/README.md

Lines changed: 8 additions & 2 deletions
@@ -101,7 +101,7 @@ to transform the message data, and writes the results to a
 [BigQuery](https://cloud.google.com/bigquery) table.
 
 * [Dockerfile](Dockerfile)
-* [StreamingBeamSQL.java](src/main/java/org/apache/beam/samples/StreamingBeamSQL.java)
+* [StreamingBeamSql.java](src/main/java/org/apache/beam/samples/StreamingBeamSql.java)
 * [pom.xml](pom.xml)
 * [metadata.json](metadata.json)
 
@@ -114,7 +114,7 @@ to transform the message data, and writes the results to a
 >
 > ```sh
 > mvn compile exec:java \
->   -Dexec.mainClass=org.apache.beam.samples.StreamingBeamSQL \
+>   -Dexec.mainClass=org.apache.beam.samples.StreamingBeamSql \
 >   -Dexec.args="\
 >     --project=$PROJECT \
 >     --inputSubscription=$SUBSCRIPTION \
@@ -189,6 +189,12 @@ necessary information to run the job, such as the SDK information and metadata.
 The [`metadata.json`](metadata.json) file contains additional information for
 the template such as the "name", "description", and input "parameters" field.
 
+We used
+[regular expressions](https://docs.microsoft.com/en-us/dotnet/standard/base-types/regular-expression-language-quick-reference)
+for validation on the input
+[Pub/Sub subscription](https://cloud.google.com/pubsub/docs/admin#resource_names)
+and [BigQuery table](https://cloud.google.com/bigquery/docs/tables#table_naming).
+
 The template file must be created in a Cloud Storage location,
 and is used to run a new Dataflow job.

dataflow/flex-templates/streaming_beam_sql/metadata.json

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
       "label": "Pub/Sub input subscription.",
       "helpText": "Pub/Sub subscription to read from.",
       "regexes": [
-        "[-_.a-zA-Z0-9]+"
+        "(?!goog)[a-zA-Z][-_.~+%a-zA-Z0-9]{2,}"
       ]
     },
     {
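The new pattern encodes Pub/Sub's documented resource-name rules: a subscription ID must start with a letter, be at least three characters long, draw only from letters, digits, and `-_.~+%`, and must not begin with the reserved prefix `goog`. As an illustration only (not part of the commit), here is a minimal sketch of how the pattern behaves, assuming it is anchored to the whole candidate string; the class name and sample inputs are hypothetical:

```java
import java.util.regex.Pattern;

/** Hypothetical stand-alone check of the subscription-ID pattern above. */
public class SubscriptionRegexCheck {
  // Same pattern as the new "regexes" entry: the first character must be a
  // letter, at least two more allowed characters must follow, and the
  // negative lookahead (?!goog) rejects IDs starting with the reserved "goog".
  private static final Pattern SUBSCRIPTION_ID =
      Pattern.compile("(?!goog)[a-zA-Z][-_.~+%a-zA-Z0-9]{2,}");

  public static void main(String[] args) {
    String[] candidates = {
      "my-subscription",     // valid
      "a1",                  // too short: fewer than 3 characters
      "goog-reserved",       // starts with the reserved prefix
      "0-starts-with-digit"  // does not start with a letter
    };
    for (String id : candidates) {
      // matches() anchors the pattern to the entire string.
      System.out.printf("%-22s -> %b%n", id, SUBSCRIPTION_ID.matcher(id).matches());
    }
  }
}
```

Note that the parameter is the short subscription ID rather than the full `projects/<project>/subscriptions/<id>` path; the pipeline expands it with `ProjectSubscriptionName.of(project, id)`, as the Java source below shows.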

dataflow/flex-templates/streaming_beam_sql/pom.xml

Lines changed: 6 additions & 0 deletions
@@ -19,6 +19,12 @@
     xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <modelVersion>4.0.0</modelVersion>
 
+  <parent>
+    <groupId>com.google.cloud.samples</groupId>
+    <artifactId>shared-configuration</artifactId>
+    <version>1.0.17</version>
+  </parent>
+
   <groupId>org.apache.beam.samples</groupId>
   <artifactId>streaming-beam-sql</artifactId>
   <version>1.0</version>
dataflow/flex-templates/streaming_beam_sql/src/main/java/org/apache/beam/samples/StreamingBeamSql.java (renamed from StreamingBeamSQL.java)

Lines changed: 63 additions & 60 deletions

@@ -1,47 +1,41 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.beam.samples;
+// Copyright 2020 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
-import java.util.Arrays;
+package org.apache.beam.samples;
 
 import com.google.api.services.bigquery.model.TableFieldSchema;
 import com.google.api.services.bigquery.model.TableRow;
 import com.google.api.services.bigquery.model.TableSchema;
 import com.google.gson.Gson;
 import com.google.pubsub.v1.ProjectSubscriptionName;
-
+import java.util.Arrays;
 import org.apache.avro.reflect.Nullable;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.coders.AvroCoder;
 import org.apache.beam.sdk.coders.DefaultCoder;
 import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 import org.apache.beam.sdk.extensions.sql.SqlTransform;
+import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
 import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
 import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO;
 import org.apache.beam.sdk.options.Default;
 import org.apache.beam.sdk.options.Description;
 import org.apache.beam.sdk.options.PipelineOptionsFactory;
 import org.apache.beam.sdk.options.StreamingOptions;
 import org.apache.beam.sdk.options.Validation;
 import org.apache.beam.sdk.schemas.Schema;
-import org.apache.beam.sdk.transforms.Create;
 import org.apache.beam.sdk.transforms.MapElements;
 import org.apache.beam.sdk.transforms.WithTimestamps;
 import org.apache.beam.sdk.transforms.windowing.FixedWindows;
@@ -57,34 +51,40 @@
  * An Apache Beam streaming pipeline that reads JSON encoded messages from Pub/Sub,
  * uses Beam SQL to transform the message data, and writes the results to a BigQuery table.
  */
-public class StreamingBeamSQL {
-  private static final Logger LOG = LoggerFactory.getLogger(StreamingBeamSQL.class);
+public class StreamingBeamSql {
+  private static final Logger LOG = LoggerFactory.getLogger(StreamingBeamSql.class);
   private static final Gson GSON = new Gson();
 
   public interface Options extends StreamingOptions {
     @Description("Pub/Sub subscription to read from.")
     @Validation.Required
     String getInputSubscription();
+
     void setInputSubscription(String value);
 
-    @Description("BigQuery table to write to, in the form 'project:dataset.table' or 'dataset.table'.")
+    @Description("BigQuery table to write to, in the form "
+        + "'project:dataset.table' or 'dataset.table'.")
     @Default.String("beam_samples.streaming_beam_sql")
     String getOutputTable();
+
     void setOutputTable(String value);
   }
 
   @DefaultCoder(AvroCoder.class)
   private static class PageReviewMessage {
-    @Nullable String url;
-    @Nullable String review;
+    @Nullable
+    String url;
+    @Nullable
+    String review;
   }
 
   public static void main(final String[] args) {
     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
     options.setStreaming(true);
 
     var project = options.as(GcpOptions.class).getProject();
-    var subscription = ProjectSubscriptionName.of(project, options.getInputSubscription()).toString();
+    var subscription = ProjectSubscriptionName
+        .of(project, options.getInputSubscription()).toString();
 
     var schema = Schema.builder()
         .addStringField("url")
@@ -96,47 +96,50 @@ public static void main(final String[] args) {
     pipeline
         // Read, parse, and validate messages from Pub/Sub.
         .apply("Read messages from Pub/Sub", PubsubIO.readStrings().fromSubscription(subscription))
-        .apply("Parse JSON into SQL rows", MapElements.into(TypeDescriptor.of(Row.class)).via(message -> {
-          // This is a good place to add error handling.
-          // The first transform should act as a validation layer to make sure
-          // that any data coming to the processing pipeline must be valid.
-          // See `MapElements.MapWithFailures` for more details.
-          LOG.info("message: {}", message);
-          var msg = GSON.fromJson(message, PageReviewMessage.class);
-          return Row.withSchema(schema).addValues(
-              msg.url,                                    // row url
-              msg.review.equals("positive") ? 1.0 : 0.0,  // row page_score
-              new Instant()                               // row processing_time
-          ).build();
-        })).setRowSchema(schema) // make sure to set the row schema for the PCollection
+        .apply("Parse JSON into SQL rows", MapElements.into(TypeDescriptor.of(Row.class))
+            .via(message -> {
+              // This is a good place to add error handling.
+              // The first transform should act as a validation layer to make sure
+              // that any data coming to the processing pipeline must be valid.
+              // See `MapElements.MapWithFailures` for more details.
+              LOG.info("message: {}", message);
+              var msg = GSON.fromJson(message, PageReviewMessage.class);
+              return Row.withSchema(schema).addValues(
+                  msg.url,                                    // row url
+                  msg.review.equals("positive") ? 1.0 : 0.0,  // row page_score
+                  new Instant()                               // row processing_time
+              ).build();
+            })).setRowSchema(schema) // make sure to set the row schema for the PCollection
 
         // Add timestamps and bundle elements into windows.
-        .apply("Add processing time", WithTimestamps.of((row) -> row.getDateTime("processing_time").toInstant()))
+        .apply("Add processing time", WithTimestamps
            .of((row) -> row.getDateTime("processing_time").toInstant()))
         .apply("Fixed-size windows", Window.into(FixedWindows.of(Duration.standardMinutes(1))))
 
         // Apply a SQL query for every window of elements.
         .apply("Run Beam SQL query", SqlTransform.query(
-            "SELECT " +
-            "  url, " +
-            "  COUNT(page_score) AS num_reviews, " +
-            "  AVG(page_score) AS score, " +
-            "  MIN(processing_time) AS first_date, " +
-            "  MAX(processing_time) AS last_date " +
-            "FROM PCOLLECTION " +
-            "GROUP BY url"
+            "SELECT "
+                + "  url, "
+                + "  COUNT(page_score) AS num_reviews, "
+                + "  AVG(page_score) AS score, "
+                + "  MIN(processing_time) AS first_date, "
+                + "  MAX(processing_time) AS last_date "
+                + "FROM PCOLLECTION "
+                + "GROUP BY url"
         ))
 
         // Convert the SQL Rows into BigQuery TableRows and write them to BigQuery.
-        .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class)).via(row -> {
-          LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"), row.getString("url"),
-              row.getInt64("num_reviews"));
-          return new TableRow()
-              .set("url", row.getString("url"))
-              .set("num_reviews", row.getInt64("num_reviews"))
-              .set("score", row.getDouble("score"))
-              .set("first_date", row.getDateTime("first_date").toInstant().toString())
-              .set("last_date", row.getDateTime("last_date").toInstant().toString());
-        }))
+        .apply("Convert to BigQuery TableRow", MapElements.into(TypeDescriptor.of(TableRow.class))
+            .via(row -> {
+              LOG.info("rating summary: {} {} ({} reviews)", row.getDouble("score"),
+                  row.getString("url"), row.getInt64("num_reviews"));
+              return new TableRow()
+                  .set("url", row.getString("url"))
+                  .set("num_reviews", row.getInt64("num_reviews"))
+                  .set("score", row.getDouble("score"))
+                  .set("first_date", row.getDateTime("first_date").toInstant().toString())
+                  .set("last_date", row.getDateTime("last_date").toInstant().toString());
+            }))
         .apply("Write to BigQuery", BigQueryIO.writeTableRows()
            .to(options.getOutputTable())
            .withSchema(new TableSchema().setFields(Arrays.asList(