diff --git a/scripts/README.md b/scripts/README.md
index 9a3b948b8..27cef9c9b 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -2,4 +2,5 @@
Common script utilities which can help automate common tasks within a BigQuery data warehouse.
-* [billing](/scripts/billing) - Example queries over the GCP billing export to help get you started using the standard data export format
\ No newline at end of file
+* [billing](/scripts/billing) - Example queries over the GCP billing export to help get you started using the standard data export format.
+* [optimization](/scripts/optimization) - Scripts that identify tables and queries which require optimization.
\ No newline at end of file
diff --git a/scripts/optimization/README.md b/scripts/optimization/README.md
new file mode 100644
index 000000000..3aff72c85
--- /dev/null
+++ b/scripts/optimization/README.md
@@ -0,0 +1,443 @@
+# Optimization Scripts
+
+This folder contains scripts that, when executed, create a dataset named
+`optimization_workshop` containing several tables. These tables are populated
+with information to help you optimize your BigQuery tables, views, and queries.
+
+Run all the scripts within this folder using the following commands:
+
+```bash
+gcloud auth login &&
+bash run_all_scripts.sh
+```
+
+The scripts are described in more detail in the following sections.
+
+---
+
+# Project Analysis
+
+Project-level analysis lets us understand key metrics such as slot_time,
+bytes_scanned, bytes_shuffled, and bytes_spilled on a daily basis within a
+project. The metrics are examined as averages, medians, and p80s. This gives us
+a high-level view of what the typical (p50) and heavier (p80) jobs within a
+project consume each day.
+
+## Daily project metrics
+
+The [daily_project_analysis.sql](daily_project_analysis.sql) script creates a
+table named `daily_project_analysis` containing daily slot consumption metrics
+(over a 30-day period) for all your projects.
+
+### Examples of querying script results
+
+* Top 100 project-days with the highest slot consumption
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.daily_project_analysis
+ ORDER BY total_slot_hours DESC
+ LIMIT 100
+ ```
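+
+* Days where a project's p80 slot time far exceeds its median, suggesting a few
+ heavy jobs dominate that day's consumption (a sketch using the script's output
+ columns; the `job_count` filter is an arbitrary threshold to skip quiet days)
+
+ ```sql
+ SELECT
+   day,
+   project_id,
+   median_total_slot_secs,
+   p80_total_slot_secs,
+   SAFE_DIVIDE(p80_total_slot_secs, median_total_slot_secs) AS p80_to_median_ratio
+ FROM optimization_workshop.daily_project_analysis
+ WHERE job_count > 100
+ ORDER BY p80_to_median_ratio DESC
+ LIMIT 100
+ ```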
+
+
+
+# Table Analysis
+
+## BigQuery Clustering/Partitioning Recommender Tool
+
+The BigQuery partitioning and clustering recommender analyzes workflows on your
+BigQuery tables and offers recommendations to better optimize your workflows and
+query costs using either table partitioning or table clustering. The recommender
+uses your BigQuery workload execution data from the past 30 days to analyze
+each BigQuery table for suboptimal partitioning and clustering configurations.
+
+> [!IMPORTANT]
+> Before you can view partition and cluster recommendations, you
+> need to [enable the Recommender API](https://cloud.google.com/recommender/docs/enabling)
+> as shown in the following sections.
+
+### Enable using gcloud
+
+```bash
+# The following script retrieves all distinct projects from the JOBS_BY_ORGANIZATION view
+# and then enables the recommender API for each project.
+projects=$(
+ bq query \
+ --nouse_legacy_sql \
+ --format=csv \
+ "SELECT DISTINCT project_id FROM \`region-us\`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION" \
+ | sed 1d
+);
+for proj in $projects; do
+ gcloud services --project="${proj}" enable recommender.googleapis.com &
+done
+```
+
+### Enable using Terraform
+
+```hcl
+resource "google_project_service" "recommender_service" {
+ project = "your-project"
+ service = "recommender.googleapis.com"
+}
+```
+
+### View your partition and cluster recommendations
+
+Once you've enabled the Recommender API,
+you can [view your partition and cluster recommendations](https://cloud.google.com/bigquery/docs/view-partition-cluster-recommendations#view_recommendations)
+in the Cloud Console or via the gcloud command-line tool.
+
+> [!NOTE]
+> The most scalable method for viewing your partition and cluster
+> recommendations is to export your recommendations to BigQuery.
+> You can do this by creating a Data Transfer Service (DTS) job to export your
+> recommendations to BigQuery.
+> See [Exporting recommendations to BigQuery](https://cloud.google.com/recommender/docs/bq-export/export-recommendations-to-bq#create_a_data_transfer_for_recommendations)
+> for more information.
+
+Once you set up the DTS job to export your recommendations to BigQuery, you can
+run the following query to get the most recent recommendations for partitioning
+and clustering your tables.
+
+```sql
+CREATE TEMP FUNCTION extract_table(target_resources ARRAY<STRING>) AS((
+ SELECT ARRAY_AGG(
+ REGEXP_REPLACE(REGEXP_EXTRACT(target_resource, r'\/projects\/(.*?\/datasets\/.*?\/tables\/.*)'), "(/datasets/|/tables/)", ".")
+ )
+ FROM UNNEST(target_resources) target_resource)[OFFSET(0)]
+);
+SELECT
+ MAX(last_refresh_time) AS latest_recommendation_time,
+ recommender_subtype AS recommendation,
+ MAX_BY(JSON_VALUE_ARRAY(COALESCE(
+ PARSE_JSON(recommendation_details).overview.partitionColumns,
+ PARSE_JSON(recommendation_details).overview.clusterColumns)),
+ last_refresh_time) AS columns_to_cluster_or_partition,
+ extract_table(target_resources) AS table,
+ bqutil.fn.table_url(extract_table(target_resources)) AS table_url,
+ JSON_VALUE(PARSE_JSON(recommendation_details).overview.partitionTimeUnit) AS partition_time_unit,
+-- Replace the table below with your own table that you created when you
+-- set up the DTS job to export your recommendations to BigQuery.
+FROM YOUR_PROJECT.YOUR_DATASET.recommendations_export
+WHERE recommender = "google.bigquery.table.PartitionClusterRecommender"
+GROUP BY recommendation, table, partition_time_unit, table_url
+```
+
+
+
+## Tables with query read patterns
+
+The [table_read_patterns.sql](table_read_patterns.sql) script creates a table
+named `table_read_patterns`, which contains usage data to help you determine:
+
+* Which tables (when queried) result in high slot consumption.
+* Which tables are most frequently queried.
+
+### Examples of querying script results
+
+* Tables grouped by similar filter predicates
+
+ ```sql
+ SELECT
+ table_id,
+ bqutil.fn.table_url(table_id) AS table_url,
+ (SELECT STRING_AGG(column ORDER BY column) FROM UNNEST(predicates)) column_list,
+ (SELECT STRING_AGG(operator ORDER BY column) FROM UNNEST(predicates)) operator_list,
+ (SELECT STRING_AGG(value ORDER BY column) FROM UNNEST(predicates)) value_list,
+ SUM(stage_slot_ms) AS total_slot_ms,
+ COUNT(DISTINCT DATE(creation_time)) as num_days_queried,
+ COUNT(*) AS num_occurrences,
+ COUNT(DISTINCT job_id) as job_count,
+ ARRAY_AGG(CONCAT(project_id,':us.',job_id) ORDER BY total_slot_ms DESC LIMIT 10) AS job_id_array,
+ ARRAY_AGG(bqutil.fn.job_url(project_id || ':us.' || job_id)) AS job_url_array,
+ FROM optimization_workshop.table_read_patterns
+ GROUP BY 1,2,3,4,5;
+ ```
+
+* Top 100 tables with the highest slot consumption
+
+ ```sql
+ SELECT
+ table_id,
+ bqutil.fn.table_url(table_id) AS table_url,
+ SUM(stage_slot_ms) AS total_slot_ms,
+ COUNT(DISTINCT DATE(creation_time)) as num_days_queried,
+ COUNT(*) AS num_occurrences,
+ COUNT(DISTINCT job_id) as job_count,
+ FROM optimization_workshop.table_read_patterns
+ GROUP BY 1,2
+ ORDER BY total_slot_ms DESC
+ LIMIT 100
+ ```
+
+* Top 100 most frequently queried tables
+
+ ```sql
+ SELECT
+ table_id,
+ bqutil.fn.table_url(table_id) AS table_url,
+ SUM(stage_slot_ms) AS total_slot_ms,
+ COUNT(DISTINCT DATE(creation_time)) as num_days_queried,
+ COUNT(*) AS num_occurrences,
+ COUNT(DISTINCT job_id) as job_count,
+ FROM optimization_workshop.table_read_patterns
+ GROUP BY 1,2
+ ORDER BY num_occurrences DESC
+ LIMIT 100
+ ```
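+
+* Daily read-slot trend for a single table (a sketch; the table ID shown is a
+ placeholder you must replace with one of your own)
+
+ ```sql
+ SELECT
+   date,
+   SUM(stage_slot_ms) AS daily_slot_ms,
+   COUNT(DISTINCT job_id) AS daily_jobs
+ FROM optimization_workshop.table_read_patterns
+ WHERE full_table_id = 'your-project.your_dataset.your_table' -- replace this placeholder
+ GROUP BY date
+ ORDER BY date
+ ```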
+
+
+
+## Tables without partitioning or clustering
+
+The [tables_without_partitioning_or_clustering.sql](tables_without_partitioning_or_clustering.sql)
+script creates a table named `tables_without_part_clust`,
+which contains a list of tables that meet any of the following conditions:
+
+- not partitioned
+- not clustered
+- neither partitioned nor clustered
+
+### Examples of querying script results
+
+* Top 100 largest tables without partitioning or clustering
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.tables_without_part_clust
+ ORDER BY logical_gigabytes DESC
+ LIMIT 100
+ ```
+
+
+
+## Actively read tables with partitioning and clustering information
+
+> [!IMPORTANT]
+> The [actively_read_tables_with_partitioning_clustering_info.sql](actively_read_tables_with_partitioning_clustering_info.sql)
+> script depends on the `table_read_patterns` table, so you must first run the
+> [table_read_patterns.sql](table_read_patterns.sql) script.
+
+The [actively_read_tables_with_partitioning_clustering_info.sql](actively_read_tables_with_partitioning_clustering_info.sql)
+script creates a table named `actively_read_tables_with_part_clust_info`,
+which contains a list of actively read tables along with their partitioning and
+clustering information.
+
+### Examples of querying script results
+
+* Top 100 largest **actively read** tables without partitioning or clustering
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.actively_read_tables_with_part_clust_info
+ WHERE clustering_columns IS NULL OR partitioning_column IS NULL
+ ORDER BY logical_gigabytes DESC
+ LIMIT 100
+ ```
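+
+* Largest actively read tables that lack clustering, with the columns most
+ often used as filter predicates (a sketch; the counts in `predicate_columns`
+ can suggest candidate clustering keys)
+
+ ```sql
+ SELECT
+   table_url,
+   logical_gigabytes,
+   num_days_queried,
+   total_slot_ms,
+   predicate_columns
+ FROM optimization_workshop.actively_read_tables_with_part_clust_info
+ WHERE clustering_columns IS NULL
+ ORDER BY total_slot_ms DESC
+ LIMIT 100
+ ```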
+
+
+
+## Tables receiving high quantity of daily DML statements
+
+The [frequent_daily_table_dml.sql](frequent_daily_table_dml.sql) script creates
+a table named `frequent_daily_table_dml`, which contains tables that have had
+more than 24 DML statements run against them in a single day within the past 30 days.
+
+### Examples of querying script results
+
+* Top 100 tables with the most DML statements per table in a day
+
+ ```sql
+ SELECT
+ table_id,
+ table_url,
+ ANY_VALUE(dml_execution_date HAVING MAX daily_dml_per_table) AS sample_dml_execution_date,
+ ANY_VALUE(job_urls[OFFSET(0)] HAVING MAX daily_dml_per_table) AS sample_dml_job_url,
+ MAX(daily_dml_per_table) max_daily_table_dml,
+ FROM optimization_workshop.frequent_daily_table_dml
+ GROUP BY table_id, table_url
+ ORDER BY max_daily_table_dml DESC
+ LIMIT 100;
+ ```
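+
+* Tables with the most slot time spent on DML over the period (a sketch using
+ the columns the script produces; useful for spotting candidates for batching
+ DML statements)
+
+ ```sql
+ SELECT
+   table_url,
+   COUNT(DISTINCT dml_execution_date) AS days_with_heavy_dml,
+   SUM(daily_dml_per_table) AS total_dml_statements,
+   SUM(sum_total_slot_ms) / (1000 * 60 * 60) AS total_dml_slot_hours
+ FROM optimization_workshop.frequent_daily_table_dml
+ GROUP BY table_url
+ ORDER BY total_dml_slot_hours DESC
+ LIMIT 100
+ ```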
+
+
+
+## Views with non-optimal JOIN conditions
+
+The [views_with_nonoptimal_join_condition.sql](views_with_nonoptimal_join_condition.sql)
+script creates a table named `views_with_nonoptimal_join_condition`, which
+contains views with JOINs where the JOIN condition is potentially non-optimal.
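+
+### Examples of querying script results
+
+* Inspect the flagged views (a minimal sketch; see the script for the exact
+ columns it emits, then filter or sort on them as needed)
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.views_with_nonoptimal_join_condition
+ LIMIT 100
+ ```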
+
+
+
+# Query Analysis
+
+## Queries grouped by hash
+
+The [queries_grouped_by_hash.sql](queries_grouped_by_hash.sql) script creates a
+table named `queries_grouped_by_hash`. This table groups queries by their
+normalized query pattern, which ignores comments, parameter values, UDFs, and
+literals in the query text. This allows us to group queries that are logically
+the same but have different literals. The `queries_grouped_by_hash` table does
+not expose the raw SQL text of the queries.
+
+The [viewable_queries_grouped_by_hash.sql](viewable_queries_grouped_by_hash.sql)
+script creates a table named `viewable_queries_grouped_by_hash`. This table is
+similar to the `queries_grouped_by_hash` table, but it exposes the raw SQL text
+of the queries. The `viewable_queries_grouped_by_hash.sql` script takes much
+longer to run than the `queries_grouped_by_hash.sql` script because it has to
+loop over all projects and, for each project, query the
+`INFORMATION_SCHEMA.JOBS_BY_PROJECT` view.
+
+For example, the following queries would be grouped together because the date
+literal filters are ignored:
+
+```sql
+SELECT * FROM my_table WHERE date = '2020-01-01';
+SELECT * FROM my_table WHERE date = '2020-01-02';
+SELECT * FROM my_table WHERE date = '2020-01-03';
+```
+
+### Examples of querying script results
+
+* Top 100 queries with the highest bytes processed
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.queries_grouped_by_hash
+ ORDER BY total_gigabytes_processed DESC
+ LIMIT 100
+ ```
+
+* Top 100 recurring queries with the highest slot hours consumed
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.queries_grouped_by_hash
+ ORDER BY total_slot_hours * days_active * job_count DESC
+ LIMIT 100
+ ```
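+
+* Recurring query patterns: patterns active on several distinct days, ranked by
+ average slot hours per active day (the 7-day threshold is an arbitrary
+ assumption; tune it to your own definition of "recurring")
+
+ ```sql
+ SELECT
+   query_hash,
+   days_active,
+   job_count,
+   avg_total_slot_hours_per_active_day,
+   top_10_job_urls
+ FROM optimization_workshop.queries_grouped_by_hash
+ WHERE days_active >= 7
+ ORDER BY avg_total_slot_hours_per_active_day DESC
+ LIMIT 100
+ ```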
+
+
+
+## Queries grouped by script
+
+The [queries_grouped_by_script.sql](queries_grouped_by_script.sql) script
+creates a table named `queries_grouped_by_script`. This table groups queries by
+their parent job ID, which means that any queries executed as part of a script
+(multi-statement query) are grouped together. This table is helpful if you want
+to see which particular scripts are most inefficient.
+
+### Examples of querying script results
+
+* Top 100 scripts with the highest bytes processed
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.queries_grouped_by_script
+ ORDER BY total_gigabytes_processed DESC
+ LIMIT 100
+ ```
+
+* Top 100 scripts with the highest slot hours consumed
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.queries_grouped_by_script
+ ORDER BY total_slot_hours DESC
+ LIMIT 100
+ ```
+
+* Top 100 scripts with the highest slot hours consumed that don't reference
+ INFORMATION_SCHEMA views
+
+ ```sql
+ SELECT *
+ FROM optimization_workshop.queries_grouped_by_script
+ WHERE NOT EXISTS(
+ SELECT 1
+ FROM UNNEST(referenced_tables) table
+ WHERE table LIKE "INFORMATION_SCHEMA%"
+ )
+ ORDER BY total_slot_hours DESC
+ LIMIT 100
+ ```
+
+
+
+## Hourly slot consumption by query hash
+
+The [hourly_slot_consumption_by_query_hash.sql](hourly_slot_consumption_by_query_hash.sql) script
+creates a table named `hourly_slot_consumption_by_query_hash`. This table groups
+queries by their query hash and then shows, for every one-hour timeslice, how
+much of that hour's slot consumption each query hash was responsible for.
+
+> [!NOTE]
+> The script only analyzes jobs that ran in a single reservation; set the
+> `my_reservation_id` variable at the top of the script before running it.
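+
+### Examples of querying script results
+
+* Busiest hours along with the query patterns that dominated them (a sketch
+ based on the script's output columns; each element of
+ `top_job_patterns_per_hour` is one pattern's share of that hour)
+
+ ```sql
+ SELECT
+   period_start_hour,
+   period_total_slot_hours,
+   pattern.query_hash,
+   pattern.num_period_slots,
+   pattern.pct_of_total_period,
+   pattern.job_url
+ FROM optimization_workshop.hourly_slot_consumption_by_query_hash,
+ UNNEST(top_job_patterns_per_hour) AS pattern
+ ORDER BY period_total_slot_hours DESC, pattern.num_period_slots DESC
+ LIMIT 100
+ ```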
+
+
+
+## Queries with performance insights
+
+The [query_performance_insights.sql](query_performance_insights.sql) script
+creates a table named `query_performance_insights`, which contains all queries
+that have had performance insights generated for them in the past 30 days.
+
+### Examples of querying script results
+
+* Top 100 queries with the most performance insights
+
+ ```sql
+ SELECT
+ job_url,
+ (SELECT COUNT(1)
+ FROM UNNEST(performance_insights.stage_performance_standalone_insights) perf_insight
+ WHERE perf_insight.slot_contention
+ ) AS num_stages_with_slot_contention,
+ (SELECT COUNT(1)
+ FROM UNNEST(performance_insights.stage_performance_standalone_insights) perf_insight
+ WHERE perf_insight.insufficient_shuffle_quota
+ ) AS num_stages_with_insufficient_shuffle_quota,
+ (SELECT ARRAY_AGG(perf_insight.input_data_change.records_read_diff_percentage IGNORE NULLS)
+ FROM UNNEST(performance_insights.stage_performance_change_insights) perf_insight
+ ) AS records_read_diff_percentages
+ FROM optimization_workshop.query_performance_insights
+ ORDER BY (
+ num_stages_with_slot_contention
+ + num_stages_with_insufficient_shuffle_quota
+ + ARRAY_LENGTH(records_read_diff_percentages)
+ ) DESC
+ LIMIT 100
+ ```
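+
+* Queries whose stages hit insufficient shuffle quota (a sketch; the same
+ pattern works for `slot_contention`)
+
+ ```sql
+ SELECT job_url
+ FROM optimization_workshop.query_performance_insights
+ WHERE EXISTS (
+   SELECT 1
+   FROM UNNEST(performance_insights.stage_performance_standalone_insights) i
+   WHERE i.insufficient_shuffle_quota
+ )
+ LIMIT 100
+ ```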
+
+
diff --git a/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql b/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql
new file mode 100644
index 000000000..662194d8d
--- /dev/null
+++ b/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql
@@ -0,0 +1,121 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named actively_read_tables_with_part_clust_info,
+ * which contains a list of actively read tables along with their
+ * partitioning and clustering information.
+ */
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+ SELECT ARRAY_AGG(DISTINCT project_id)
+ FROM optimization_workshop.table_read_patterns
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.actively_read_tables_with_part_clust_info
+(
+ project_id STRING,
+ dataset_id STRING,
+ table_id STRING,
+ total_slot_ms FLOAT64,
+ total_jobs INT64,
+ max_slot_ms_job_url STRING,
+ num_days_queried INT64,
+ predicate_columns STRING,
+ partitioning_column STRING,
+ clustering_columns STRING,
+ table_url STRING,
+ logical_gigabytes FLOAT64,
+ logical_terabytes FLOAT64
+);
+
+FOR p IN (
+ SELECT project_id
+ FROM
+ UNNEST(projects) project_id
+)
+DO
+BEGIN
+ EXECUTE IMMEDIATE FORMAT("""
+ INSERT INTO optimization_workshop.actively_read_tables_with_part_clust_info
+ SELECT
+ rp.* EXCEPT(predicates),
+ ARRAY_TO_STRING(ARRAY_CONCAT_AGG((
+ SELECT
+ ARRAY_AGG(predicate_column_counts.column || ':' || predicate_column_counts.cnt)
+ FROM (
+ SELECT
+ STRUCT(predicate.column, COUNT(predicate.column) AS cnt) AS predicate_column_counts
+ FROM UNNEST(predicates) predicate
+ GROUP BY predicate.column
+ ))), ', ') AS predicate_columns,
+ partitioning_column,
+ clustering_columns,
+ bqutil.fn.table_url(rp.project_id || '.' || rp.dataset_id || '.' || rp.table_id) AS table_url,
+ SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes,
+ SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes,
+ FROM
+ `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s
+ JOIN (
+ SELECT
+ project_id,
+ dataset_id,
+ table_id,
+ ARRAY_CONCAT_AGG(predicates) AS predicates,
+ SUM(stage_slot_ms) AS total_slot_ms,
+ COUNT(DISTINCT job_id) AS total_jobs,
+ ANY_VALUE(job_url HAVING MAX(total_slot_ms)) AS max_slot_ms_job_url,
+ COUNT(DISTINCT date) AS num_days_queried,
+ FROM optimization_workshop.table_read_patterns
+ GROUP BY
+ project_id,
+ dataset_id,
+ table_id
+ ) rp
+ ON (s.project_id = rp.project_id AND s.table_schema = rp.dataset_id AND s.table_name = rp.table_id)
+ JOIN (
+ SELECT
+ table_catalog,
+ table_schema,
+ table_name,
+ STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column,
+ STRING_AGG(
+ CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END
+ ORDER BY CLUSTERING_ORDINAL_POSITION
+ ) AS clustering_columns
+ FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS`
+ GROUP BY 1,2,3
+ ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name)
+ GROUP BY
+ project_id,
+ dataset_id,
+ table_id,
+ table_url,
+ total_slot_ms,
+ total_jobs,
+ max_slot_ms_job_url,
+ num_days_queried,
+ partitioning_column,
+ clustering_columns;
+ """,
+ p.project_id);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors
+END;
+END FOR;
diff --git a/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql b/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql
new file mode 100644
index 000000000..bd2fe1370
--- /dev/null
+++ b/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql
@@ -0,0 +1,103 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+ SELECT ARRAY_AGG(DISTINCT project_id)
+ FROM optimization_workshop.table_read_patterns
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.actively_read_tables_with_part_clust_info
+(
+ project_id STRING,
+ dataset_id STRING,
+ table_id STRING,
+ total_slot_ms FLOAT64,
+ total_jobs INT64,
+ max_slot_ms_job_url STRING,
+ num_days_queried INT64,
+ partitioning_column STRING,
+ clustering_columns STRING,
+ table_url STRING,
+ logical_gigabytes FLOAT64,
+ logical_terabytes FLOAT64
+);
+
+FOR p IN (
+ SELECT project_id
+ FROM
+ UNNEST(projects) project_id
+)
+DO
+BEGIN
+ EXECUTE IMMEDIATE FORMAT("""
+ INSERT INTO optimization_workshop.actively_read_tables_with_part_clust_info
+ SELECT
+ rp.*,
+ partitioning_column,
+ clustering_columns,
+ bqutil.fn.table_url(rp.project_id || '.' || rp.dataset_id || '.' || rp.table_id) AS table_url,
+ SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes,
+ SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes,
+ FROM
+ `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s
+ JOIN (
+ SELECT
+ project_id,
+ dataset_id,
+ table_id,
+ SUM(total_slot_ms) AS total_slot_ms,
+ COUNT(DISTINCT job_id) AS total_jobs,
+ ANY_VALUE(job_url HAVING MAX(total_slot_ms)) AS max_slot_ms_job_url,
+ COUNT(DISTINCT date) AS num_days_queried,
+ FROM optimization_workshop.table_read_patterns
+ GROUP BY
+ project_id,
+ dataset_id,
+ table_id
+ ) rp
+ ON (s.project_id = rp.project_id AND s.table_schema = rp.dataset_id AND s.table_name = rp.table_id)
+ JOIN (
+ SELECT
+ table_catalog,
+ table_schema,
+ table_name,
+ STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column,
+ STRING_AGG(
+ CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END
+ ORDER BY CLUSTERING_ORDINAL_POSITION
+ ) AS clustering_columns
+ FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS`
+ GROUP BY 1,2,3
+ ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name)
+ GROUP BY
+ project_id,
+ dataset_id,
+ table_id,
+ table_url,
+ total_slot_ms,
+ total_jobs,
+ max_slot_ms_job_url,
+ num_days_queried,
+ partitioning_column,
+ clustering_columns;
+ """,
+ p.project_id);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors
+END;
+END FOR;
\ No newline at end of file
diff --git a/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql b/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql
new file mode 100644
index 000000000..659ec4633
--- /dev/null
+++ b/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * The following script creates a table named table_read_patterns,
+ * that contains a list of the most frequently read tables within the
+ * past 30 days.
+ * 30 days is the default timeframe, but you can change this by setting the
+ * num_days_to_scan variable to a different value.
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns
+CLUSTER BY project_id, dataset_id, table_id AS
+SELECT
+ DATE(creation_time) AS date,
+ jbo.project_id,
+ table.dataset_id,
+ table.table_id,
+ table.project_id || '.' || table.dataset_id || '.' || table.table_id AS full_table_id,
+ job_id,
+ bqutil.fn.job_url(jbo.project_id || ':us.' || job_id) AS job_url,
+ parent_job_id,
+ bqutil.fn.job_url(jbo.project_id || ':us.' || parent_job_id) AS parent_job_url,
+ reservation_id,
+ total_bytes_billed,
+ total_slot_ms,
+ creation_time,
+ start_time,
+ end_time
+FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo,
+UNNEST(referenced_tables) table
+WHERE
+ DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+ AND (
+ table.project_id||table.dataset_id||table.table_id
+ <> destination_table.project_id||destination_table.dataset_id||destination_table.table_id
+ )
+ AND job_type = 'QUERY'
+ AND statement_type != 'SCRIPT'
+ AND NOT cache_hit
+ AND error_result IS NULL;
diff --git a/scripts/optimization/daily_project_analysis.sql b/scripts/optimization/daily_project_analysis.sql
new file mode 100644
index 000000000..4b30a848b
--- /dev/null
+++ b/scripts/optimization/daily_project_analysis.sql
@@ -0,0 +1,94 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table that contains daily slot consumption information
+ * about BigQuery jobs for a given set of projects within the past 30 days.
+ *
+ * 30 days is the default timeframe, but you can change this by setting the
+ * num_days_to_scan variable to a different value.
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.daily_project_analysis AS
+SELECT
+ day,
+ project_id,
+ COUNT(*) AS job_count,
+ AVG(total_slot_ms)/1000 AS avg_total_slot_secs,
+ MAX(median_total_slot_ms)/1000 AS median_total_slot_secs,
+ MAX(p80_total_slot_ms)/1000 AS p80_total_slot_secs,
+ SUM(total_slot_ms)/1000/60/60 AS total_slot_hours,
+ AVG(time_secs) AS avg_time_secs,
+ MAX(median_time_secs) AS median_time_secs,
+ SUM(time_secs)/60/60 AS total_time_hours,
+ MAX(p80_time_secs) AS p80_time_secs,
+ AVG(bytes_scanned)/POW(1024,3) AS avg_gb_scanned,
+ MAX(p80_bytes_scanned)/POW(1024,3) AS p80_gb_scanned,
+ SUM(bytes_scanned)/POW(1024,4) AS total_tb_scanned,
+ AVG(bytes_shuffled)/POW(1024,3) AS avg_gb_shuffled,
+ MAX(p80_bytes_shuffled)/POW(1024,3) AS p80_gb_shuffled,
+ SUM(bytes_shuffled)/POW(1024,4) AS total_tb_shuffled,
+ AVG(bytes_spilled)/POW(1024,3) AS avg_gb_spilled,
+ MAX(p80_bytes_spilled)/POW(1024,3) AS p80_gb_spilled,
+ SUM(bytes_spilled)/POW(1024,4) AS total_tb_spilled,
+FROM(
+ SELECT
+ day,
+ project_id,
+ total_slot_ms,
+ PERCENTILE_CONT(total_slot_ms, 0.5)
+ OVER (PARTITION BY day, project_id) AS median_total_slot_ms,
+ PERCENTILE_CONT(total_slot_ms, 0.8)
+ OVER (PARTITION BY day, project_id) AS p80_total_slot_ms,
+ time_secs,
+ PERCENTILE_CONT(time_secs, 0.5)
+ OVER (PARTITION BY day, project_id) AS median_time_secs,
+ PERCENTILE_CONT(time_secs, 0.8)
+ OVER (PARTITION BY day, project_id) AS p80_time_secs,
+ total_bytes_scanned bytes_scanned,
+ PERCENTILE_CONT(total_bytes_scanned, 0.8)
+ OVER (PARTITION BY day, project_id) AS p80_bytes_scanned,
+ bytes_shuffled,
+ PERCENTILE_CONT(bytes_shuffled, 0.8)
+ OVER (PARTITION BY day, project_id) AS p80_bytes_shuffled,
+ bytes_spilled,
+ PERCENTILE_CONT(bytes_spilled, 0.8)
+ OVER (PARTITION BY day, project_id) AS p80_bytes_spilled
+ FROM(
+ SELECT
+ DATE(jbo.creation_time) AS day,
+ project_id,
+ job_id,
+ total_slot_ms,
+ TIMESTAMP_DIFF(jbo.end_time,jbo.start_time, SECOND) AS time_secs,
+ total_bytes_billed total_bytes_scanned,
+ (SELECT SUM(stage.shuffle_output_bytes) FROM UNNEST(job_stages) stage) AS bytes_shuffled,
+ (SELECT SUM(stage.shuffle_output_bytes_spilled) FROM UNNEST(job_stages) stage) AS bytes_spilled
+ FROM
+ `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo
+ WHERE
+ DATE(jbo.creation_time) >= CURRENT_DATE - num_days_to_scan
+ -- Uncomment below to specify a list of projects to analyze
+ -- AND jbo.project_id IN ()
+ AND jbo.job_type = 'QUERY'
+ AND jbo.end_time > jbo.start_time
+ AND jbo.error_result IS NULL
+ AND jbo.statement_type != 'SCRIPT'
+))
+GROUP BY 1, 2;
diff --git a/scripts/optimization/enable_recommender_api.sh b/scripts/optimization/enable_recommender_api.sh
new file mode 100644
index 000000000..6aeca2742
--- /dev/null
+++ b/scripts/optimization/enable_recommender_api.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# The following script retrieves all distinct projects from the JOBS_BY_ORGANIZATION view
+# and then enables the recommender API for each project.
+# This is useful for when you have a large number of projects and you want to enable the
+# recommender API for all of them.
+projects=$(
+ bq query \
+ --nouse_legacy_sql \
+ --format=csv \
+ "SELECT DISTINCT project_id FROM \`region-us\`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION" |
+ sed 1d
+)
+for proj in $projects; do
+ gcloud services --project="${proj}" enable recommender.googleapis.com &
+done
diff --git a/scripts/optimization/frequent_daily_table_dml.sql b/scripts/optimization/frequent_daily_table_dml.sql
new file mode 100644
index 000000000..897424598
--- /dev/null
+++ b/scripts/optimization/frequent_daily_table_dml.sql
@@ -0,0 +1,52 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * The following script will return tables that have had > 24 DML statements
+ * run against them in any one day within the past 30 days.
+ * 30 days is the default timeframe, but you can change this by setting the
+ * num_days_to_scan variable to a different value.
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.frequent_daily_table_dml AS
+SELECT
+ destination_table.project_id,
+ destination_table.dataset_id,
+ destination_table.table_id,
+ EXTRACT(DATE FROM creation_time) AS dml_execution_date,
+ bqutil.fn.table_url(destination_table.project_id || '.' || destination_table.dataset_id || '.' || destination_table.table_id) AS table_url,
+ COUNT(1) AS daily_dml_per_table,
+ ARRAY_AGG(job_id) AS job_ids,
+ ARRAY_AGG(bqutil.fn.job_url(project_id || ':us.' || job_id) IGNORE NULLS) AS job_urls,
+ ARRAY_AGG(DISTINCT statement_type) AS statement_types,
+ SUM(SAFE_DIVIDE(total_bytes_processed, pow(2,30))) AS sum_total_gb_processed,
+ AVG(SAFE_DIVIDE(total_bytes_processed, pow(2,30))) AS avg_total_gb_processed,
+ SUM(total_slot_ms) AS sum_total_slot_ms,
+ AVG(total_slot_ms) AS avg_total_slot_ms,
+ SUM(SAFE_DIVIDE(total_slot_ms, (TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)))) AS sum_avg_slots,
+ AVG(SAFE_DIVIDE(total_slot_ms, (TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)))) AS avg_avg_slots,
+FROM
+ `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+WHERE
+ -- Look at the past 30 days of jobs
+ DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+ -- Only look at DML statements
+ AND statement_type IN ('INSERT', 'UPDATE', 'DELETE', 'MERGE')
+ GROUP BY 1,2,3,4,5
+ HAVING daily_dml_per_table > 24;
diff --git a/scripts/optimization/hourly_slot_consumption_by_query_hash.sql b/scripts/optimization/hourly_slot_consumption_by_query_hash.sql
new file mode 100644
index 000000000..337e5e2b9
--- /dev/null
+++ b/scripts/optimization/hourly_slot_consumption_by_query_hash.sql
@@ -0,0 +1,128 @@
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+DECLARE my_reservation_id STRING DEFAULT "your_reservation_id";
+
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE `optimization_workshop.hourly_slot_consumption_by_query_hash` AS
+SELECT * EXCEPT(jobs_per_hour),
+-- Get the top 10 query patterns by slot hours
+ARRAY(
+ SELECT AS STRUCT
+ -- since period is hourly, period_slot_hours/(1hr duration) = period_num_slots
+ ROUND(period_slot_hours, 2) AS num_period_slots,
+ CAST(ROUND(period_slot_hours / period_total_slot_hours * 100, 2) AS STRING) || "%" AS pct_of_total_period,
+ COALESCE(slot_hunger,0) AS slot_hunger,
+ ROUND(avg_period_estimated_runnable_units,2) AS avg_period_estimated_runnable_units,
+ ROUND(max_duration_minutes, 2) AS max_duration_minutes,
+ ROUND(max_p90_period_slots, 2) AS max_p90_period_slots,
+ ROUND(p90_avg_job_period_slots, 2) AS p90_avg_job_period_slots,
+ ROUND(p90_duration_minutes, 2) AS p90_duration_minutes,
+ query_hash,
+ num_days_query_hash_active,
+ labels,
+ num_jobs_per_period,
+ job_url,
+ parent_job_url,
+ FROM(
+ SELECT
+ query_hash,
+ ANY_VALUE(num_days_query_hash_active) AS num_days_query_hash_active,
+ ARRAY_AGG(DISTINCT FORMAT("%T", labels)) AS labels,
+ MAX(duration_ms)/1000/60 AS max_duration_minutes,
+ SUM(slot_hunger) AS slot_hunger,
+ AVG(avg_period_estimated_runnable_units) AS avg_period_estimated_runnable_units,
+ MAX(p90_job_period_slots) AS max_p90_period_slots,
+ ANY_VALUE(p90_avg_job_period_slots) AS p90_avg_job_period_slots,
+ ANY_VALUE(p90_duration_minutes) AS p90_duration_minutes,
+ SUM(period_slot_hours) AS period_slot_hours,
+ COUNT(1) AS num_jobs_per_period,
+ ANY_VALUE(job_url HAVING MAX duration_ms) AS job_url,
+ ANY_VALUE(parent_job_url HAVING MAX duration_ms) AS parent_job_url,
+ ANY_VALUE(user_email HAVING MAX duration_ms) AS user_email,
+ FROM (
+ SELECT *,
+ PERCENTILE_CONT(duration_ms, 0.9) OVER(
+ PARTITION BY query_hash
+ )/1000/60 AS p90_duration_minutes,
+ PERCENTILE_CONT(avg_job_period_slots, 0.9) OVER(
+ PARTITION BY query_hash
+ ) AS p90_avg_job_period_slots,
+ FROM UNNEST(jobs_per_hour)
+ )
+ GROUP BY 1
+ ORDER BY period_slot_hours DESC
+ LIMIT 10
+ )
+) AS top_job_patterns_per_hour
+FROM(
+ SELECT
+ FORMAT_TIMESTAMP("%F %H", period_start_hour, "America/New_York") AS period_start_hour,
+ COUNT(DISTINCT job_id) AS num_jobs_in_period,
+ SUM(period_slot_ms)/1000/60/60 AS period_total_slot_hours,
+ MAX(p90_job_period_slots) AS max_p90_job_period_slots,
+ ARRAY_AGG(STRUCT(
+ period_slot_ms/1000/60/60 AS period_slot_hours,
+ p90_job_period_slots,
+ avg_job_period_slots,
+ duration_ms,
+ avg_period_estimated_runnable_units,
+ slot_hunger,
+ query_hash,
+ num_days_query_hash_active,
+ labels,
+ bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url,
+ COALESCE(bqutil.fn.job_url(project_id || ':us.' || parent_job_id), '') AS parent_job_url,
+ user_email
+ )) AS jobs_per_hour,
+ FROM(
+ SELECT
+ TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York") AS period_start_hour,
+ project_id,
+ parent_job_id,
+ job_id,
+ user_email,
+ AVG(COALESCE(period_estimated_runnable_units,0)) AS avg_period_estimated_runnable_units,
+ SUM(IF(COALESCE(period_estimated_runnable_units,0) >= 1000,1,0)) AS slot_hunger,
+ SUM(period_slot_ms) AS period_slot_ms,
+ ANY_VALUE(p90_job_period_slots) AS p90_job_period_slots,
+ ANY_VALUE(avg_job_period_slots) AS avg_job_period_slots,
+ FROM(
+ SELECT *,
+ PERCENTILE_CONT(period_slot_ms/1000, 0.9) OVER(
+ PARTITION BY TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York"), job_id
+ ) AS p90_job_period_slots,
+ AVG(period_slot_ms/1000) OVER(
+ PARTITION BY TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York"), job_id
+ ) AS avg_job_period_slots,
+
+ FROM `region-us`.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION
+ WHERE
+ DATE(job_creation_time) >= CURRENT_DATE - num_days_to_scan
+ AND reservation_id = my_reservation_id
+ AND state IN ('RUNNING','DONE')
+ AND statement_type != 'SCRIPT'
+ AND period_slot_ms > 0
+ -- Uncomment below to ignore slots consumed by failed queries
+ -- AND error_result IS NULL
+ )
+ GROUP BY 1,2,3,4,5
+ )
+ JOIN (
+ SELECT
+ job_id,
+ query_info.query_hashes.normalized_literals AS query_hash,
+ labels,
+ TIMESTAMP_DIFF(end_time, start_time, MILLISECOND) AS duration_ms,
+ COUNT(DISTINCT DATE(creation_time)) OVER(
+ PARTITION BY query_info.query_hashes.normalized_literals
+ ) AS num_days_query_hash_active,
+ FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+ WHERE
+ DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+ AND reservation_id = my_reservation_id
+ AND statement_type != 'SCRIPT'
+ AND total_slot_ms > 0
+ -- Uncomment below to ignore slots consumed by failed queries
+ -- AND error_result IS NULL
+ ) USING(job_id)
+ GROUP BY period_start_hour
+);
\ No newline at end of file
diff --git a/scripts/optimization/queries_grouped_by_hash.sql b/scripts/optimization/queries_grouped_by_hash.sql
new file mode 100644
index 000000000..e4bbfdce0
--- /dev/null
+++ b/scripts/optimization/queries_grouped_by_hash.sql
@@ -0,0 +1,84 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named queries_grouped_by_hash,
+ * which contains queries grouped by their normalized query hash
+ * within the past 30 days.
+ * 30 days is the default timeframe, but you can change this by setting the
+ * num_days_to_scan variable to a different value.
+ * Queries are grouped by their normalized query pattern, which ignores
+ * comments, parameter values, UDFs, and literals in the query text.
+ * This allows us to group queries that are logically the same, but
+ * have different literals.
+ *
+ * For example, the following queries would be grouped together:
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01'
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02'
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03'
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS (
+ COALESCE((
+ SELECT SUM(IF(i.slot_contention, 1, 0) + IF(i.insufficient_shuffle_quota, 1, 0))
+ FROM UNNEST(query_info.performance_insights.stage_performance_standalone_insights) i), 0)
+ + COALESCE(ARRAY_LENGTH(query_info.performance_insights.stage_performance_change_insights), 0)
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash AS
+SELECT
+ statement_type,
+ query_info.query_hashes.normalized_literals AS query_hash,
+ COUNT(DISTINCT DATE(start_time)) AS days_active,
+ ARRAY_AGG(DISTINCT project_id IGNORE NULLS) AS project_ids,
+ ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids,
+ SUM(num_stages_with_perf_insights(query_info)) AS num_stages_with_perf_insights,
+ COUNT(DISTINCT (project_id || ':us.' || job_id)) AS job_count,
+ ARRAY_AGG(
+ STRUCT(
+ bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url,
+ bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url
+ )
+ ORDER BY total_slot_ms
+ DESC LIMIT 10) AS top_10_job_urls,
+ ARRAY_AGG(DISTINCT user_email) AS user_emails,
+ SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
+ AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed,
+ SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours,
+ SUM(total_slot_ms) / (1000 * 60 * 60) / COUNT(DISTINCT DATE(start_time)) AS avg_total_slot_hours_per_active_day,
+ AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds,
+ ARRAY_AGG(DISTINCT FORMAT("%T",labels)) AS labels,
+ SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots,
+ AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots,
+ -- query hashes will all have the same referenced tables so we can use ANY_VALUE below
+ ANY_VALUE(ARRAY(
+ SELECT
+ ref_table.project_id || '.' ||
+ IF(STARTS_WITH(ref_table.dataset_id, '_'), 'TEMP', ref_table.dataset_id)
+ || '.' || ref_table.table_id
+ FROM UNNEST(referenced_tables) ref_table
+ )) AS referenced_tables,
+FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+WHERE
+ DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+ AND state = 'DONE'
+ AND error_result IS NULL
+ AND job_type = 'QUERY'
+ AND statement_type != 'SCRIPT'
+GROUP BY statement_type, query_hash;
diff --git a/scripts/optimization/queries_grouped_by_labels.sql b/scripts/optimization/queries_grouped_by_labels.sql
new file mode 100644
index 000000000..77d5514cb
--- /dev/null
+++ b/scripts/optimization/queries_grouped_by_labels.sql
@@ -0,0 +1,41 @@
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS (
+ COALESCE((
+ SELECT SUM(IF(i.slot_contention, 1, 0) + IF(i.insufficient_shuffle_quota, 1, 0))
+ FROM UNNEST(query_info.performance_insights.stage_performance_standalone_insights) i), 0)
+ + COALESCE(ARRAY_LENGTH(query_info.performance_insights.stage_performance_change_insights), 0)
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_labels AS
+SELECT
+ FORMAT("%T", labels) AS labels,
+ COUNT(DISTINCT DATE(start_time)) AS days_active,
+ ARRAY_AGG(DISTINCT project_id IGNORE NULLS) AS project_ids,
+ ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids,
+ SUM(num_stages_with_perf_insights(query_info)) AS num_stages_with_perf_insights,
+ COUNT(DISTINCT (project_id || ':us.' || job_id)) AS job_count,
+ ARRAY_AGG(
+ STRUCT(
+ bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url,
+ bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url
+ )
+ ORDER BY total_slot_ms
+ DESC LIMIT 10) AS top_10_job_urls,
+ ARRAY_AGG(DISTINCT user_email) AS user_emails,
+ SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
+ AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed,
+ SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours,
+ SUM(total_slot_ms) / (1000 * 60 * 60) / COUNT(DISTINCT DATE(start_time)) AS avg_total_slot_hours_per_active_day,
+ AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds,
+ SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots,
+ AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots,
+FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+WHERE
+ DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+ AND state = 'DONE'
+ AND error_result IS NULL
+ AND job_type = 'QUERY'
+ AND statement_type != 'SCRIPT'
+ AND user_email LIKE '%gserviceaccount.com'
+GROUP BY labels;
\ No newline at end of file
diff --git a/scripts/optimization/queries_grouped_by_script.sql b/scripts/optimization/queries_grouped_by_script.sql
new file mode 100644
index 000000000..e7803cff8
--- /dev/null
+++ b/scripts/optimization/queries_grouped_by_script.sql
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE TEMP FUNCTION job_stage_max_slots(job_stages ANY TYPE) AS ((
+ SELECT MAX(SAFE_DIVIDE(stage.slot_ms,stage.end_ms - stage.start_ms))
+ FROM UNNEST(job_stages) stage
+));
+CREATE TEMP FUNCTION total_bytes_shuffled(job_stages ANY TYPE) AS ((
+ SELECT SUM(stage.shuffle_output_bytes)
+ FROM UNNEST(job_stages) stage
+));
+CREATE TEMP FUNCTION total_shuffle_bytes_spilled(job_stages ANY TYPE) AS ((
+ SELECT SUM(stage.shuffle_output_bytes_spilled)
+ FROM UNNEST(job_stages) stage
+));
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_script AS
+SELECT * REPLACE((
+ SELECT
+ ARRAY_AGG(DISTINCT table.project_id || "." || table.dataset_id || "." || table.table_id)
+ FROM UNNEST(referenced_tables) table
+ ) AS referenced_tables)
+FROM(
+ SELECT
+ bqutil.fn.job_url(
+ parent.project_id || ':us.' || parent.job_id) AS job_url,
+ parent.user_email,
+ parent.start_time,
+ ANY_VALUE(parent.labels) AS labels,
+ SUM(COALESCE(SAFE_DIVIDE(child.total_slot_ms, TIMESTAMP_DIFF(
+ child.end_time, child.start_time, MILLISECOND)), 0)) AS total_slots,
+ SUM(COALESCE(child.total_slot_ms, 0)) AS total_slot_ms,
+ SUM(COALESCE(child.total_slot_ms / (1000 * 60 * 60), 0)) AS total_slot_hours,
+ SUM(COALESCE(child.total_bytes_processed, 0)) / POW(1024, 3) AS total_gigabytes_processed,
+ SUM(COALESCE(child.total_bytes_processed, 0)) / POW(1024, 4) AS total_terabytes_processed,
+ ARRAY_CONCAT_AGG(child.referenced_tables) AS referenced_tables,
+ ARRAY_AGG(STRUCT(
+ bqutil.fn.job_url(
+ child.project_id || ':us.' || child.job_id) AS job_url,
+ child.reservation_id AS reservation_id,
+ EXTRACT(DATE FROM child.creation_time) AS creation_date,
+ TIMESTAMP_DIFF(child.end_time, child.start_time, SECOND) AS job_duration_seconds,
+ child.total_slot_ms AS total_slot_ms,
+ SAFE_DIVIDE(child.total_slot_ms,TIMESTAMP_DIFF(
+ child.end_time, child.start_time, MILLISECOND)) AS job_avg_slots,
+ job_stage_max_slots(child.job_stages) AS job_stage_max_slots,
+ total_bytes_shuffled(child.job_stages) AS total_bytes_shuffled,
+ total_shuffle_bytes_spilled(child.job_stages) AS total_shuffle_bytes_spilled)
+ ORDER BY child.start_time
+ ) AS children_jobs_details
+ FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION AS parent
+ JOIN `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION AS child
+ ON parent.job_id = child.parent_job_id
+ WHERE
+ DATE(parent.creation_time) >= CURRENT_DATE - num_days_to_scan
+ AND parent.state = 'DONE'
+ AND parent.error_result IS NULL
+ GROUP BY job_url, user_email, start_time
+ HAVING
+ ARRAY_LENGTH(children_jobs_details) > 1
+ AND total_slot_ms > 0);
diff --git a/scripts/optimization/query_performance_insights.sql b/scripts/optimization/query_performance_insights.sql
new file mode 100644
index 000000000..1e0218893
--- /dev/null
+++ b/scripts/optimization/query_performance_insights.sql
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script retrieves all queries that have had performance insights
+ * generated for them in the past 30 days.
+ * 30 days is the default timeframe, but you can change this by setting the
+ * num_days_to_scan variable to a different value.
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.query_performance_insights AS
+SELECT
+ bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url,
+ query_info.performance_insights AS performance_insights
+FROM
+ `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+WHERE
+ DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+ AND job_type = 'QUERY'
+ AND state = 'DONE'
+ AND error_result IS NULL
+ AND statement_type != 'SCRIPT'
+ AND EXISTS ( -- Only include queries which had performance insights
+ SELECT 1
+ FROM UNNEST(
+ query_info.performance_insights.stage_performance_standalone_insights
+ )
+ WHERE slot_contention OR insufficient_shuffle_quota
+ UNION ALL
+ SELECT 1
+ FROM UNNEST(
+ query_info.performance_insights.stage_performance_change_insights
+ )
+ WHERE input_data_change.records_read_diff_percentage IS NOT NULL
+ );
diff --git a/scripts/optimization/run_all_scripts.sh b/scripts/optimization/run_all_scripts.sh
new file mode 100644
index 000000000..596dca24a
--- /dev/null
+++ b/scripts/optimization/run_all_scripts.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Exit immediately if a command exits with a non-zero status.
+set -e
+# Set the following flags for the bq command:
+# --quiet: suppress status updates while jobs are running
+# --nouse_legacy_sql: use standard SQL syntax
+# --nouse_cache: do not use cached results
+bq_flags="--quiet --nouse_legacy_sql --nouse_cache"
+
+# Run all the .sql files in the current directory in parallel,
+# except for table_read_patterns.sql
+# and actively_read_tables_with_partitioning_clustering_info.sql
+# since they'll be run sequentially due to the dependency between them.
+for f in *.sql; do
+ if [[ $f = "table_read_patterns.sql" ||
+ $f = "actively_read_tables_with_partitioning_clustering_info.sql" ]]; then
+ # Skip these files; they're run sequentially after the loop
+ continue
+ fi
+ bq query ${bq_flags} < "$f" &
+done
+
+# Run the table_read_patterns.sql file first because it's a dependency for
+# actively_read_tables_with_partitioning_clustering_info.sql
+bq query ${bq_flags} < table_read_patterns.sql
+bq query ${bq_flags} < actively_read_tables_with_partitioning_clustering_info.sql
diff --git a/scripts/optimization/table_read_patterns.sql b/scripts/optimization/table_read_patterns.sql
new file mode 100644
--- /dev/null
+++ b/scripts/optimization/table_read_patterns.sql
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+ SELECT ARRAY_AGG(DISTINCT project_id)
+ FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+ WHERE DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+);
+
+/*
+-- Uncomment this block if you just want to scan the top 1000 projects
+-- by total bytes billed in the past 30 days.
+DECLARE projects ARRAY<STRING> DEFAULT (
+ SELECT ARRAY_AGG(project_id)
+ FROM(
+ SELECT project_id
+ FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+ WHERE DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+ GROUP BY 1
+ ORDER BY SUM(total_bytes_billed) DESC
+ LIMIT 1000
+));
+*/
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns
+(
+ date DATE,
+ project_id STRING,
+ dataset_id STRING,
+ table_id STRING,
+ full_table_id STRING,
+ job_id STRING,
+ job_url STRING,
+ parent_job_id STRING,
+ parent_job_url STRING,
+ reservation_id STRING,
+ total_bytes_billed INT64,
+ total_slot_ms INT64,
+ creation_time TIMESTAMP,
+ start_time TIMESTAMP,
+ end_time TIMESTAMP,
+ stage_name STRING,
+ stage_id INT64,
+ stage_slot_ms INT64,
+ total_job_read_slot_ms INT64,
+ records_read INT64,
+ records_written INT64,
+ shuffle_output_bytes INT64,
+ shuffle_output_bytes_spilled INT64,
+ parallel_inputs INT64,
+ read_ratio_avg FLOAT64,
+ read_ms_avg INT64,
+ wait_ratio_avg FLOAT64,
+ wait_ms_avg INT64,
+ compute_ratio_avg FLOAT64,
+ compute_ms_avg INT64,
+ write_ratio_avg FLOAT64,
+ write_ms_avg INT64,
+ predicates ARRAY<STRUCT<operator STRING, column STRING, value STRING>>
+) CLUSTER BY project_id, dataset_id, table_id;
+
+CREATE TEMP FUNCTION mapColumns(where_clause STRING, column_list ARRAY<STRING>)
+RETURNS STRING
+LANGUAGE js AS r"""
+ if (!where_clause) {
+ return null;
+ }
+ column_list.forEach(function(col) {
+ const tokens = col.split(":");
+ where_clause = where_clause.replaceAll(tokens[0].trim(), tokens[1]);
+ });
+ return where_clause;
+""";
+
+FOR p IN (
+ SELECT project_id
+ FROM
+ UNNEST(projects) project_id
+)
+DO
+BEGIN
+EXECUTE IMMEDIATE FORMAT("""
+INSERT INTO optimization_workshop.table_read_patterns
+ SELECT
+ DATE(creation_time) AS date,
+ project_id,
+ IF(ARRAY_LENGTH(SPLIT(table_id, '.'))=2, SPLIT(table_id, '.')[OFFSET(0)], SPLIT(table_id, '.')[OFFSET(1)]) AS dataset_id,
+ SPLIT(table_id, '.')[ORDINAL(ARRAY_LENGTH(SPLIT(table_id, '.')))] AS table_id,
+ IF(ARRAY_LENGTH(SPLIT(table_id, '.'))=2, project_id || '.' || table_id, table_id) AS full_table_id,
+ job_id,
+ bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url,
+ parent_job_id,
+ bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url,
+ reservation_id,
+ total_bytes_billed,
+ total_slot_ms,
+ creation_time,
+ start_time,
+ end_time,
+ stage_name,
+ stage_id,
+ stage_slot_ms,
+ total_job_read_slot_ms,
+ records_read,
+ records_written,
+ shuffle_output_bytes,
+ shuffle_output_bytes_spilled,
+ parallel_inputs,
+ read_ratio_avg,
+ read_ms_avg,
+ wait_ratio_avg,
+ wait_ms_avg,
+ compute_ratio_avg,
+ compute_ms_avg,
+ write_ratio_avg,
+ write_ms_avg,
+ ARRAY(
+ SELECT STRUCT(
+ REGEXP_EXTRACT(predicate, '^[[:word:]]+') AS operator,
+ REGEXP_EXTRACT(predicate, '[(]([[:word:]]+)') AS column,
+ REGEXP_EXTRACT(predicate, '[,](.+)[)]') AS value )
+ FROM UNNEST(filters) AS predicate
+ ) AS predicates
+ FROM (
+ SELECT *,
+ REGEXP_EXTRACT_ALL(
+ mapColumns(where_clause, projection_list),
+ '[[:word:]]+[(][^()]*?[)]') AS filters
+ FROM (
+ SELECT
+ jbp.project_id,
+ job_id,
+ parent_job_id,
+ reservation_id,
+ total_bytes_billed,
+ total_slot_ms,
+ creation_time,
+ start_time,
+ end_time,
+ js.name AS stage_name,
+ js.id AS stage_id,
+ SUM(js.slot_ms) OVER (PARTITION BY job_id) AS total_job_read_slot_ms,
+ js.slot_ms AS stage_slot_ms,
+ js.records_read,
+ js.records_written,
+ js.shuffle_output_bytes,
+ js.shuffle_output_bytes_spilled,
+ js.parallel_inputs,
+ js.read_ratio_avg,
+ js.read_ms_avg,
+ js.wait_ratio_avg,
+ js.wait_ms_avg,
+ js.compute_ratio_avg,
+ js.compute_ms_avg,
+ js.write_ratio_avg,
+ js.write_ms_avg,
+ SPLIT(js_steps.substeps[safe_OFFSET(0)], ',') AS projection_list,
+ REPLACE(js_steps.substeps[safe_OFFSET(1)],'FROM ', '') AS table_id,
+ js_steps.substeps[safe_OFFSET(2)] AS where_clause
+ FROM `%s.region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT jbp
+ JOIN UNNEST(job_stages) AS js
+ JOIN UNNEST(steps) AS js_steps
+ WHERE
+ DATE(creation_time) >= CURRENT_DATE - %i
+ AND js_steps.kind = 'READ'
+ AND jbp.job_type = 'QUERY'
+ AND jbp.statement_type != 'SCRIPT'
+ AND NOT cache_hit
+ AND error_result IS NULL
+ AND NOT EXISTS ( -- Exclude queries over INFORMATION_SCHEMA
+ SELECT 1
+ FROM UNNEST(js_steps.substeps) AS substeps
+ WHERE substeps LIKE 'FROM %%.INFORMATION_SCHEMA.%%')
+ AND EXISTS ( -- Only include substeps with a FROM clause
+ SELECT 1
+ FROM UNNEST(js_steps.substeps) AS substeps
+ WHERE substeps LIKE 'FROM %%.%%')
+))""", p.project_id, num_days_to_scan);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors
+END;
+END FOR;
diff --git a/scripts/optimization/table_read_patterns_by_query_hash.sql b/scripts/optimization/table_read_patterns_by_query_hash.sql
new file mode 100644
index 000000000..dd69ec428
--- /dev/null
+++ b/scripts/optimization/table_read_patterns_by_query_hash.sql
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named table_read_patterns_by_query_hash,
+ * which aggregates table read activity over the past 30 days (configurable
+ * via the num_days_to_scan variable) and groups it by normalized query
+ * hash, so the heaviest recurring query patterns per table stand out.
+ */
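+
+/*
+ * Example (a hypothetical query over this script's results) listing the
+ * tables read by the largest number of distinct query patterns:
+ *
+ *   SELECT project_id, dataset_id, table_id, num_query_patterns
+ *   FROM optimization_workshop.table_read_patterns_by_query_hash
+ *   ORDER BY num_query_patterns DESC
+ *   LIMIT 100;
+ */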
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns_by_query_hash
+CLUSTER BY project_id, dataset_id, table_id AS
+SELECT
+ project_id,
+ dataset_id,
+ table_id,
+ COUNT(query_pattern) AS num_query_patterns,
+ ARRAY_AGG(
+ STRUCT(
+ query_pattern,
+ top_10_slot_ms_jobs,
+ avg_slot_hours,
+ days_active,
+ job_count,
+ total_gigabytes_processed,
+ avg_gigabytes_processed,
+ total_slot_hours,
+ avg_total_slot_hours_per_active_day,
+ avg_job_duration_seconds,
+ total_slots,
+ avg_total_slots
+ )
+ ORDER BY avg_slot_hours * days_active * job_count DESC LIMIT 10
+ ) AS top_10_slot_ms_patterns,
+ MAX(days_active) AS max_days_active,
+ MAX(avg_slot_hours) AS max_avg_slot_hours_across_all_patterns,
+ MAX(days_active * avg_slot_hours) AS max_weighted_avg_slot_hours,
+FROM (
+ SELECT
+ referenced_table.project_id,
+ referenced_table.dataset_id,
+ referenced_table.table_id,
+ query_info.query_hashes.normalized_literals AS query_pattern,
+ ARRAY_AGG(STRUCT(
+ bqutil.fn.job_url(jbo.project_id || ':us.' || job_id) AS job_url,
+ bqutil.fn.job_url(jbo.project_id || ':us.' || parent_job_id) AS parent_job_url
+ )
+ ORDER BY total_slot_ms DESC LIMIT 10
+ ) AS top_10_slot_ms_jobs,
+ COUNT(DISTINCT DATE(start_time)) AS days_active,
+ ARRAY_AGG(DISTINCT jbo.project_id IGNORE NULLS) AS project_ids,
+ ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids,
+    COUNT(DISTINCT job_id) AS job_count,
+ ARRAY_AGG(DISTINCT user_email) AS user_emails,
+ SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
+ AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed,
+ SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours,
+ AVG(total_slot_ms) / (1000 * 60 * 60) AS avg_slot_hours,
+ SUM(total_slot_ms) / (1000 * 60 * 60) / COUNT(DISTINCT DATE(start_time)) AS avg_total_slot_hours_per_active_day,
+ AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds,
+ SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots,
+ AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots,
+ FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo, UNNEST(referenced_tables) referenced_table
+ WHERE
+ DATE(start_time) >= CURRENT_DATE - num_days_to_scan
+ AND (
+ referenced_table.project_id||referenced_table.dataset_id||referenced_table.table_id
+ <> destination_table.project_id||destination_table.dataset_id||destination_table.table_id
+ )
+ AND state = 'DONE'
+ AND error_result IS NULL
+ AND job_type = 'QUERY'
+ AND statement_type != 'SCRIPT'
+ AND referenced_table.table_id NOT LIKE '%INFORMATION_SCHEMA%'
+    AND user_email LIKE '%gserviceaccount.com' -- only jobs run by service accounts (automated workloads)
+ GROUP BY 1,2,3,4
+) GROUP BY 1,2,3;
diff --git a/scripts/optimization/tables_without_partitioning_or_clustering.sql b/scripts/optimization/tables_without_partitioning_or_clustering.sql
new file mode 100644
index 000000000..578309368
--- /dev/null
+++ b/scripts/optimization/tables_without_partitioning_or_clustering.sql
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named tables_without_part_clust,
+ * which lists tables from the 100 projects with the most logical bytes
+ * that are:
+ * - not partitioned,
+ * - not clustered, or
+ * - neither partitioned nor clustered.
+ */
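+
+/*
+ * Example (a hypothetical query over this script's results) listing the
+ * largest tables that are missing both partitioning and clustering:
+ *
+ *   SELECT table_url, logical_gigabytes
+ *   FROM optimization_workshop.tables_without_part_clust
+ *   WHERE partitioning_column IS NULL AND clustering_columns IS NULL
+ *   ORDER BY logical_gigabytes DESC
+ *   LIMIT 100;
+ */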
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+ SELECT
+ ARRAY_AGG(project_id)
+  FROM (
+ SELECT project_id
+ FROM `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION`
+ WHERE NOT deleted
+ GROUP BY 1
+ ORDER BY SUM(total_logical_bytes) DESC
+ LIMIT 100
+ )
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.tables_without_part_clust
+(
+ table_catalog STRING,
+ table_schema STRING,
+ table_name STRING,
+ table_url STRING,
+ partitioning_column STRING,
+ clustering_columns STRING,
+  logical_gigabytes FLOAT64,
+ logical_terabytes FLOAT64
+);
+
+FOR p IN (
+ SELECT project_id
+ FROM
+ UNNEST(projects) project_id
+)
+DO
+BEGIN
+ EXECUTE IMMEDIATE FORMAT(
+ """
+ INSERT INTO
+ optimization_workshop.tables_without_part_clust
+ SELECT
+ s.table_catalog,
+ s.table_schema,
+ s.table_name,
+ bqutil.fn.table_url(s.table_catalog || '.' || s.table_schema || '.' || s.table_name) AS table_url,
+ partitioning_column,
+ clustering_columns,
+ SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes,
+ SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes,
+ FROM
+ `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s
+ JOIN (
+ SELECT
+ table_catalog,
+ table_schema,
+ table_name,
+        STRING_AGG(IF(is_partitioning_column = "YES", column_name, CAST(NULL AS STRING))) AS partitioning_column,
+        STRING_AGG(
+          CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END
+          ORDER BY clustering_ordinal_position
+        ) AS clustering_columns
+ FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS`
+ GROUP BY 1,2,3
+ ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name)
+ WHERE
+ clustering_columns IS NULL OR partitioning_column IS NULL
+ GROUP BY
+ s.table_catalog,
+ s.table_schema,
+ s.table_name,
+ table_url,
+ partitioning_column,
+ clustering_columns;
+ """,
+ p.project_id);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; -- Surface the error, then continue with the next project.
+END;
+END FOR;
diff --git a/scripts/optimization/viewable_queries_grouped_by_hash.sql b/scripts/optimization/viewable_queries_grouped_by_hash.sql
new file mode 100644
index 000000000..ffd275f91
--- /dev/null
+++ b/scripts/optimization/viewable_queries_grouped_by_hash.sql
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named viewable_queries_grouped_by_hash,
+ * which contains query statistics from the 100 projects with the highest
+ * total bytes billed within the past 30 days.
+ * 30 days is the default timeframe, but you can change this by setting the
+ * num_days_to_scan variable to a different value.
+ * Queries are grouped by their normalized query pattern, which ignores
+ * comments, parameter values, UDFs, and literals in the query text.
+ * This allows us to group queries that are logically the same, but
+ * have different literals.
+ *
+ * For example, the following queries would be grouped together:
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01'
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02'
+ * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03'
+ */
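+
+/*
+ * Example (a hypothetical query over this script's results) listing the
+ * query patterns that consumed the most slot hours:
+ *
+ *   SELECT Query_Hash, Ref_Tables, Job_Count, Total_Slot_Hours
+ *   FROM optimization_workshop.viewable_queries_grouped_by_hash
+ *   ORDER BY Total_Slot_Hours DESC
+ *   LIMIT 10;
+ */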
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+ SELECT
+ ARRAY_AGG(project_id)
+  FROM (
+ SELECT project_id
+ FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+ WHERE DATE(creation_time) >= CURRENT_DATE - 30
+ GROUP BY 1
+ ORDER BY SUM(total_bytes_billed) DESC
+ LIMIT 100
+ )
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.viewable_queries_grouped_by_hash
+(
+ Query_Hash STRING,
+ Query_Raw_Sample STRING,
+ Job_Origin STRING,
+ Ref_Tables STRING,
+ Days_Active INT64,
+ Job_Count INT64,
+ Avg_Job_Count_Active_Days INT64,
+ Project_Id STRING,
+ BQ_Region STRING,
+ Reservation_Id STRING,
+ Total_Gigabytes_Processed INT64,
+ Total_Gigabytes_Processed_Per_Job INT64,
+ Avg_Gigabytes_Processed INT64,
+ Total_Slot_Hours INT64,
+ Avg_Total_Slot_Hours_per_Active_Day INT64,
+ Avg_Job_Duration_Seconds INT64,
+  Any_Job_Ids ARRAY<STRING>,
+ User_Emails STRING,
+ Labels STRING
+);
+
+FOR p IN (
+ SELECT project_id
+ FROM
+ UNNEST(projects) project_id
+)
+DO
+BEGIN
+EXECUTE IMMEDIATE FORMAT("""
+INSERT INTO optimization_workshop.viewable_queries_grouped_by_hash
+SELECT
+ query_hash AS Query_Hash,
+ ANY_VALUE(query_raw) AS Query_Raw_Sample,
+  -- Derive a rough job origin from the job ID prefix (for example,
+  -- console-issued jobs typically start with "bquxjob").
+  SPLIT(ANY_VALUE(job_ids)[OFFSET(0)], '_')[OFFSET(0)] AS Job_Origin,
+ Ref_Tables AS Ref_Tables,
+ COUNT(DISTINCT creation_dt) AS Days_Active,
+ SUM(job_count) AS Job_Count,
+ CAST(AVG(job_count) AS INT64) AS Avg_Job_Count_Active_Days,
+ Project_Id AS Project_Id,
+ 'us' AS BQ_Region,
+ Reservation_Id AS Reservation_Id,
+ CAST(SUM(total_gigabytes_processed) AS INT64) AS Total_Gigabytes_Processed,
+  CAST(SUM(total_gigabytes_processed) / SUM(job_count) AS INT64) AS Total_Gigabytes_Processed_Per_Job,
+ CAST(AVG(total_gigabytes_processed) AS INT64) AS Avg_Gigabytes_Processed,
+ CAST(SUM(total_slot_hours_per_day) AS INT64) AS Total_Slot_Hours,
+ CAST(AVG(total_slot_hours_per_day) AS INT64) AS Avg_Total_Slot_Hours_per_Active_Day,
+ CAST(AVG(avg_job_duration_seconds) AS INT64) AS Avg_Job_Duration_Seconds,
+ ANY_VALUE(job_ids) AS Any_Job_Ids,
+ STRING_AGG(DISTINCT user_emails_unnest) AS User_Emails,
+ STRING_AGG(DISTINCT labels_concat) AS Labels
+FROM (
+ SELECT
+ query_hash,
+ ANY_VALUE(query_raw) AS query_raw,
+ ref_tables AS ref_tables,
+ creation_dt AS creation_dt,
+ project_id AS project_id,
+ reservation_id AS reservation_id,
+ COUNT(*) AS job_count,
+ ARRAY_AGG(job_id ORDER BY total_slot_ms DESC LIMIT 10) AS job_ids,
+ SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours_per_day,
+ SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
+ AVG(job_duration_seconds) AS avg_job_duration_seconds,
+ ARRAY_AGG(DISTINCT user_email) AS user_emails,
+ STRING_AGG(DISTINCT labels_concat) AS labels_concat
+ FROM (
+ SELECT
+ query_info.query_hashes.normalized_literals AS query_hash,
+ query AS query_raw,
+ DATE(jbp.creation_time) AS creation_dt,
+ jbp.project_id AS project_id,
+ jbp.reservation_id AS reservation_id,
+ jbp.job_id AS job_id,
+ jbp.total_bytes_processed AS total_bytes_processed,
+ jbp.total_slot_ms AS total_slot_ms,
+ jbp.total_slot_ms / TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, MILLISECOND) AS slots,
+ TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, SECOND) AS job_duration_seconds,
+ user_email,
+      -- Dataset names starting with '_' are hidden/anonymous datasets, so
+      -- label them as TEMP in the referenced-tables list.
+      STRING_AGG(
+        ref_tables.project_id || '.' ||
+        IF(STARTS_WITH(ref_tables.dataset_id, '_'), 'TEMP', ref_tables.dataset_id) ||
+        '.' || ref_tables.table_id
+        ORDER BY
+          ref_tables.project_id || '.' ||
+          IF(STARTS_WITH(ref_tables.dataset_id, '_'), 'TEMP', ref_tables.dataset_id) ||
+          '.' || ref_tables.table_id
+      ) AS ref_tables,
+ FORMAT("%%T", ARRAY_CONCAT_AGG(labels)) AS labels_concat
+ FROM
+    `%s.region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT AS jbp
+ JOIN
+ UNNEST(referenced_tables) ref_tables
+ WHERE
+ DATE(jbp.creation_time) >= CURRENT_DATE - %i
+ AND jbp.end_time > jbp.start_time
+ AND jbp.error_result IS NULL
+ AND jbp.job_type = 'QUERY'
+ AND jbp.statement_type != 'SCRIPT'
+    AND ref_tables.table_id NOT LIKE '%%INFORMATION_SCHEMA%%'
+ GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11
+ )
+ GROUP BY 1, 3, 4, 5, 6)
+JOIN
+  UNNEST(user_emails) AS user_emails_unnest
+GROUP BY
+ Query_Hash,
+ Ref_Tables,
+ Project_Id,
+ BQ_Region,
+ Reservation_Id;
+""",
+p.project_id, num_days_to_scan);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; -- Surface the error, then continue with the next project.
+END;
+END FOR;
diff --git a/scripts/optimization/views_with_nonoptimal_join_condition.sql b/scripts/optimization/views_with_nonoptimal_join_condition.sql
new file mode 100644
index 000000000..0ee15d0ec
--- /dev/null
+++ b/scripts/optimization/views_with_nonoptimal_join_condition.sql
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named views_with_nonoptimal_join_condition,
+ * which lists views whose join conditions apply a function call to either
+ * side of the join equality (see the examples in the temp function below),
+ * a pattern that can prevent efficient join execution.
+ */
+
+DECLARE projects ARRAY DEFAULT (
+ SELECT ARRAY_AGG(DISTINCT project_id)
+ FROM `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION`
+ WHERE NOT deleted
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.views_with_nonoptimal_join_condition
+(
+ view_name STRING,
+ view_url STRING,
+ view_definition STRING,
+  join_conditions ARRAY<STRING>
+);
+
+CREATE TEMP FUNCTION extract_nonoptimal_join_conditions(view_definition STRING) AS(
+  ARRAY_CONCAT(
+    -- Join conditions with a function call on the left-hand side of the
+    -- equality, e.g. ON UPPER(a.name) = b.name
+    REGEXP_EXTRACT_ALL(
+      REGEXP_REPLACE(UPPER(view_definition), r"\sON\s", "\nON "),
+      r"\nON\s+[A-Z_]+?\([^=]*?=[^=]*"
+    ),
+    -- Join conditions with a function call on the right-hand side of the
+    -- equality, e.g. ON a.name = UPPER(b.name)
+    REGEXP_EXTRACT_ALL(
+      REGEXP_REPLACE(UPPER(view_definition), r"\sON\s", "\nON "),
+      r"\nON\s+[^=]*?=\s*[A-Z_]+?\([^=]*"
+)));
+
+FOR p IN (
+ SELECT project_id
+ FROM
+ UNNEST(projects) project_id
+)
+DO
+BEGIN
+ EXECUTE IMMEDIATE FORMAT(r"""
+ INSERT INTO optimization_workshop.views_with_nonoptimal_join_condition
+ SELECT
+ table_name AS view_name,
+ bqutil.fn.table_url(table_catalog || '.' || table_schema || '.' || table_name) AS view_url,
+ view_definition,
+ extract_nonoptimal_join_conditions(view_definition) AS join_conditions
+ FROM
+ `%s.region-us.INFORMATION_SCHEMA.VIEWS`
+ WHERE
+ ARRAY_LENGTH(extract_nonoptimal_join_conditions(view_definition)) >= 1
+ """,
+ p.project_id);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; -- Surface the error, then continue with the next project.
+END;
+END FOR;
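+
+-- Example (a hypothetical query over this script's results) surfacing the
+-- views with the most flagged join conditions:
+--   SELECT view_url, join_conditions
+--   FROM optimization_workshop.views_with_nonoptimal_join_condition
+--   ORDER BY ARRAY_LENGTH(join_conditions) DESC
+--   LIMIT 100;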