From f1e0bba812fa74eba8b5d3548041d5a59f13dc2d Mon Sep 17 00:00:00 2001 From: Daniel De Leo Date: Mon, 20 Nov 2023 11:14:35 -0500 Subject: [PATCH] Adding BigQuery optimization scripts (#376) * add optimization scripts * Add script to README for enabling recommender API and add new script for investigating tables with high freq DML * add comments * Adding comments to existing scripts * add default to variable * linting * add script for harvesting query perf insights * nit * improve and add comments * remove query hashing UDF and instead use existing query hash field in the info schema views * Update docs * remove unnecessary check * update largest tables scripts (#375) * Add frequent_daily_table_dml to README and nits * markdown formatting * revert formatting * remove experimental script * standardizing scripts to use the same num_days_to_scan variable * refactor scripts to not do any sorting * add missing INSERT * revert * remove unnecessary INSERTs and use bqutil UDF * comments fixes * fix regex on nonoptimal view script * run table_read_patterns script before other scripts * remove unnecessary alias * remove unnecessary filter * only check for completed queries * fix perf insights sql * simplify queries_grouped_by_hash.sql by making it query over the jobs_by_org view and removed a lot of the unnecessary sql logic * Adding initial commit of queries_grouped_by_script.sql * Add a version of queries_grouped_by_hash.sql that retains the raw sql query in case it's useful in future * aggregate project and reservation ids * use start_time instead of create_time when scanning history * aggregate ref_tables as an array instead of comma separated string * remove unnecessary column * simplify check for perf insights * improve query insights scripts and readme * readme fixes and adding api enabling script * linting * add --quiet to bq tool * Updates * Update table_read_patterns_by_query_hash.sql * Update table_read_patterns_by_query_hash.sql * Update table_read_patterns_by_query_hash.sql * Update table_read_patterns_by_query_hash.sql * Update queries_grouped_by_script.sql * Update queries_grouped_by_script.sql * Add script queries_grouped_by_labels.sql * order by total_slot_ms only * fixes * add hourly_slot_consumption_by_query_hash.sql * move file --------- Co-authored-by: ddeleo Co-authored-by: Christian Yarros Co-authored-by: Raj Patel <57837394+rajc242@users.noreply.github.com> --- scripts/README.md | 3 +- scripts/optimization/README.md | 443 ++++++++++++++++++ ...bles_with_partitioning_clustering_info.sql | 121 +++++ ...bles_with_partitioning_clustering_info.sql | 103 ++++ .../table_read_patterns.sql | 57 +++ .../optimization/daily_project_analysis.sql | 94 ++++ .../optimization/enable_recommender_api.sh | 30 ++ .../optimization/frequent_daily_table_dml.sql | 52 ++ .../hourly_slot_consumption_by_query_hash.sql | 128 +++++ .../optimization/queries_grouped_by_hash.sql | 84 ++++ .../queries_grouped_by_labels.sql | 41 ++ .../queries_grouped_by_script.sql | 77 +++ .../query_performance_insights.sql | 51 ++ scripts/optimization/run_all_scripts.sh | 41 ++ scripts/optimization/table_read_patterns.sql | 205 ++++++++ .../table_read_patterns_by_query_hash.sql | 90 ++++ ...les_without_partitioning_or_clustering.sql | 99 ++++ .../viewable_queries_grouped_by_hash.sql | 168 +++++++ .../views_with_nonoptimal_join_condition.sql | 70 +++ 19 files changed, 1956 insertions(+), 1 deletion(-) create mode 100644 scripts/optimization/README.md create mode 100644 
scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql create mode 100644 scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql create mode 100644 scripts/optimization/assured_workloads_compliant/table_read_patterns.sql create mode 100644 scripts/optimization/daily_project_analysis.sql create mode 100644 scripts/optimization/enable_recommender_api.sh create mode 100644 scripts/optimization/frequent_daily_table_dml.sql create mode 100644 scripts/optimization/hourly_slot_consumption_by_query_hash.sql create mode 100644 scripts/optimization/queries_grouped_by_hash.sql create mode 100644 scripts/optimization/queries_grouped_by_labels.sql create mode 100644 scripts/optimization/queries_grouped_by_script.sql create mode 100644 scripts/optimization/query_performance_insights.sql create mode 100644 scripts/optimization/run_all_scripts.sh create mode 100644 scripts/optimization/table_read_patterns.sql create mode 100644 scripts/optimization/table_read_patterns_by_query_hash.sql create mode 100644 scripts/optimization/tables_without_partitioning_or_clustering.sql create mode 100644 scripts/optimization/viewable_queries_grouped_by_hash.sql create mode 100644 scripts/optimization/views_with_nonoptimal_join_condition.sql diff --git a/scripts/README.md b/scripts/README.md index 9a3b948b8..27cef9c9b 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -2,4 +2,5 @@ Common script utilities which can help automate common tasks within a BigQuery data warehouse. -* [billing](/scripts/billing) - Example queries over the GCP billing export to help get you started using the standard data export format \ No newline at end of file +* [billing](/scripts/billing) - Example queries over the GCP billing export to help get you started using the standard data export format. +* [optimization](/scripts/optimization) - Scripts that identify tables and queries which require optimization. \ No newline at end of file diff --git a/scripts/optimization/README.md b/scripts/optimization/README.md new file mode 100644 index 000000000..3aff72c85 --- /dev/null +++ b/scripts/optimization/README.md @@ -0,0 +1,443 @@ +# Optimization Scripts + +This folder contains scripts that (when executed) create a dataset +named, `optimization_workshop`, with several tables inside the dataset. \ +These tables are populated with information to help you optimize your BigQuery +tables, views, and queries. + +Run all the scripts within this folder using the following commands: + +```bash +gcloud auth login && +bash run_all_scripts.sh +``` + +The scripts are described in more detail in the following sections. + +--- + +# Project Analysis + +Project level analysis enables us to understand key metrics such as slot_time, +bytes_scanned, bytes_shuffled and bytes_spilled on a daily basis within a +project. The metrics are examined as averages, medians and p80s. This enables us +to understand at a high level what jobs within a project consume 80% of the time +and 50% of the time daily. + +
🔍 Daily project metrics
+
+## Daily project metrics
+
+The [daily_project_analysis.sql](daily_project_analysis.sql) script creates a
+table named `daily_project_analysis` that contains daily slot consumption
+metrics (for a 30-day period) for all of your projects.
+
+### Examples of querying script results
+
+* Top 100 project-days with the highest slot consumption
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.daily_project_analysis
+  ORDER BY total_slot_hours DESC
+  LIMIT 100
+  ```
+
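+* Projects ranked by total slot hours over the scanned period (a follow-up
+  sketch that simply rolls the daily rows up by project; the column names are
+  the ones defined in [daily_project_analysis.sql](daily_project_analysis.sql))
+
+  ```sql
+  SELECT
+    project_id,
+    SUM(total_slot_hours) AS total_slot_hours,
+    SUM(total_tb_scanned) AS total_tb_scanned
+  FROM optimization_workshop.daily_project_analysis
+  GROUP BY project_id
+  ORDER BY total_slot_hours DESC
+  ```
+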
+ +# Table Analysis + +
🔍 BigQuery Clustering/Partitioning Recommender Tool + +## BigQuery Clustering/Partitioning Recommender Tool + +The BigQuery partitioning and clustering recommender analyzes workflows on your +BigQuery tables and offers recommendations to better optimize your workflows and +query costs using either table partitioning or table clustering. The recommender +uses your BigQuery's workload execution data from the past 30 days to analyze +each BigQuery table for suboptimal partitioning and clustering configurations. + +> [!IMPORTANT] +> Before you can view partition and cluster recommendations, you +> need to [enable the Recommender API](https://cloud.google.com/recommender/docs/enabling) +> as shown in the following sections. + +### Enable using gcloud + +```bash +# The following script retrieves all distinct projects from the JOBS_BY_ORGANIZATION view +# and then enables the recommender API for each project. +projects=$( + bq query \ + --nouse_legacy_sql \ + --format=csv \ + "SELECT DISTINCT project_id FROM \`region-us\`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION" \ + | sed 1d +); +for proj in $projects; do + gcloud services --project="${proj}" enable recommender.googleapis.com & +done +``` + +### Enable using Terraform + +```hcl +resource "google_project_service" "recommender_service" { + project = "your-project" + service = "recommender.googleapis.com" +} +``` + +### View your partition and cluster recommendations + +Once you've enabled the Recommender API, +you can [view your partition and cluster recommendations](https://cloud.google.com/bigquery/docs/view-partition-cluster-recommendations#view_recommendations) +in the Cloud Console or via the gcloud command-line tool. + +> [!NOTE] +> The most scalable method for viewing your partition and cluster +> recommendations is to export your recommendations to BigQuery. +> You can do this by creating a Data Transfer Service (DTS) job to export your +> recommendations to BigQuery. +> See [Exporting recommendations to BigQuery](https://cloud.google.com/recommender/docs/bq-export/export-recommendations-to-bq#create_a_data_transfer_for_recommendations) +> for more information. + +Once you set up the DTS job to export your recommendations to BigQuery, you can +run the following query to get the most recent recommendations for partitioning +and clustering your tables. + +```sql +CREATE TEMP FUNCTION extract_table(target_resources ARRAY) AS(( + SELECT ARRAY_AGG( + REGEXP_REPLACE(REGEXP_EXTRACT(target_resource, r'\/projects\/(.*?\/datasets\/.*?\/tables\/.*)'), "(/datasets/|/tables/)", ".") + ) + FROM UNNEST(target_resources) target_resource)[OFFSET(0)] +); +SELECT + MAX(last_refresh_time) AS latest_recommendation_time, + recommender_subtype AS recommendation, + MAX_BY(JSON_VALUE_ARRAY(COALESCE( + PARSE_JSON(recommendation_details).overview.partitionColumns, + PARSE_JSON(recommendation_details).overview.clusterColumns)), + last_refresh_time) AS columns_to_cluster_or_partition, + extract_table(target_resources) AS table, + bqutil.fn.table_url(extract_table(target_resources)) AS table_url, + JSON_VALUE(PARSE_JSON(recommendation_details).overview.partitionTimeUnit) AS partition_time_unit, +-- Replace the table below with your own table that you created when you +-- set up the DTS job to export your recommendations to BigQuery. +FROM YOUR_PROJECT.YOUR_DATASET.recommendations_export +WHERE recommender = "google.bigquery.table.PartitionClusterRecommender" +GROUP BY recommendation, table, partition_time_unit, table_url +``` + +
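+Once the export is in place, a quick way to gauge how many recommendations are
+outstanding is to count them by subtype. The following sketch assumes the same
+`recommendations_export` table referenced in the query above:
+
+```sql
+SELECT
+  recommender_subtype AS recommendation,
+  COUNT(*) AS num_recommendations
+-- Replace the table below with the table you created for the
+-- recommendations DTS export.
+FROM YOUR_PROJECT.YOUR_DATASET.recommendations_export
+WHERE recommender = "google.bigquery.table.PartitionClusterRecommender"
+GROUP BY recommendation
+ORDER BY num_recommendations DESC
+```
+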
+ +
🔍 Tables with query read patterns + +## Tables with query read patterns + +The [table_read_patterns.sql](table_read_patterns.sql) script creates a table +named, `table_read_patterns`, that contains usage data to help you determine: + +* Which tables (when queried) are resulting in high slot consumption. +* Which tables are most frequently queried. + +### Examples of querying script results + +* Tables grouped by similar filter predicates + + ```sql + SELECT + table_id, + bqutil.fn.table_url(table_id) AS table_url, + (SELECT STRING_AGG(column ORDER BY COLUMN) FROM UNNEST(predicates)) column_list, + (SELECT STRING_AGG(operator ORDER BY COLUMN) FROM UNNEST(predicates)) operator_list, + (SELECT STRING_AGG(value ORDER BY COLUMN) FROM UNNEST(predicates)) value_list, + SUM(stage_slot_ms) AS total_slot_ms, + COUNT(DISTINCT DATE(creation_time)) as num_days_queried, + COUNT(*) AS num_occurrences, + COUNT(DISTINCT job_id) as job_count, + ARRAY_AGG(CONCAT(project_id,':us.',job_id) ORDER BY total_slot_ms LIMIT 10) AS job_id_array, + ARRAY_AGG(bqutil.fn.job_url(project_id || ':us.' || job_id)) AS job_url_array, + FROM optimization_workshop.table_read_patterns + GROUP BY 1,2,3,4,5; + ``` + +* Top 100 tables with the highest slot consumption + + ```sql + SELECT + table_id, + bqutil.fn.table_url(table_id) AS table_url, + SUM(stage_slot_ms) AS total_slot_ms, + COUNT(DISTINCT DATE(creation_time)) as num_days_queried, + COUNT(*) AS num_occurrences, + COUNT(DISTINCT job_id) as job_count, + FROM optimization_workshop.table_read_patterns + GROUP BY 1,2 + ORDER BY total_slot_ms DESC + LIMIT 100 + ``` + +* Top 100 most frequently queried tables + + ```sql + SELECT + table_id, + bqutil.fn.table_url(table_id) AS table_url, + SUM(stage_slot_ms) AS total_slot_ms, + COUNT(DISTINCT DATE(creation_time)) as num_days_queried, + COUNT(*) AS num_occurrences, + COUNT(DISTINCT job_id) as job_count, + FROM optimization_workshop.table_read_patterns + GROUP BY 1,2 + ORDER BY num_occurrences DESC + LIMIT 100 + ``` + +
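+* Most frequently filtered columns per table (a sketch for shortlisting
+  partitioning/clustering candidates; it unnests the `predicates` array
+  populated by [table_read_patterns.sql](table_read_patterns.sql))
+
+  ```sql
+  SELECT
+    table_id,
+    bqutil.fn.table_url(table_id) AS table_url,
+    predicate.column AS filter_column,
+    COUNT(*) AS num_filters,
+    SUM(stage_slot_ms) AS total_slot_ms
+  FROM optimization_workshop.table_read_patterns, UNNEST(predicates) predicate
+  GROUP BY table_id, table_url, filter_column
+  ORDER BY total_slot_ms DESC
+  LIMIT 100
+  ```
+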
+ +
🔍 Tables without partitioning or clustering + +## Tables without partitioning or clustering + +The [tables_without_partitioning_or_clustering.sql](tables_without_partitioning_or_clustering.sql) +script creates a table named, `tables_without_part_clust`, +that contains a list of tables which meet any of the following conditions: + +- not partitioned +- not clustered +- neither partitioned nor clustered + +### Examples of querying script results + +* Top 100 largest tables without partitioning or clustering + + ```sql + SELECT * + FROM optimization_workshop.tables_without_part_clust + ORDER BY logical_gigabytes DESC + LIMIT 100 + ``` + +
+ +
🔍 Actively read tables with partitioning and clustering information
+
+## Actively read tables with partitioning and clustering information
+
+> [!IMPORTANT]
+> The [actively_read_tables_with_partitioning_clustering_info.sql](actively_read_tables_with_partitioning_clustering_info.sql)
+> script depends on the `table_read_patterns` table, so you must first run the
+> [table_read_patterns.sql](table_read_patterns.sql) script.
+
+The [actively_read_tables_with_partitioning_clustering_info.sql](actively_read_tables_with_partitioning_clustering_info.sql)
+script creates a table named `actively_read_tables_with_part_clust_info`
+that contains a list of actively read tables along with their partitioning and
+clustering information.
+
+### Examples of querying script results
+
+* Top 100 largest **actively read** tables without partitioning or clustering
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.actively_read_tables_with_part_clust_info
+  WHERE clustering_columns IS NULL OR partitioning_column IS NULL
+  ORDER BY logical_gigabytes DESC
+  LIMIT 100
+  ```
+
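+* Largest actively read tables that are partitioned but not clustered, together
+  with the columns queries most often filter on (column names are the ones
+  defined in
+  [actively_read_tables_with_partitioning_clustering_info.sql](actively_read_tables_with_partitioning_clustering_info.sql))
+
+  ```sql
+  SELECT
+    table_url,
+    predicate_columns,
+    logical_gigabytes,
+    total_slot_ms
+  FROM optimization_workshop.actively_read_tables_with_part_clust_info
+  WHERE partitioning_column IS NOT NULL AND clustering_columns IS NULL
+  ORDER BY total_slot_ms DESC
+  LIMIT 100
+  ```
+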
+ +
🔍 Tables receiving high quantity of daily DML statements + +## Tables receiving high quantity of daily DML statements + +The [frequent_daily_table_dml.sql](frequent_daily_table_dml.sql) script creates +a table named, `frequent_daily_table_dml`, that contains tables that have had +more than 24 daily DML statements run against them in the past 30 days. + +### Examples of querying script results + +* Top 100 tables with the most DML statements per table in a day + + ```sql + SELECT + table_id, + table_url, + ANY_VALUE(dml_execution_date HAVING MAX daily_dml_per_table) AS sample_dml_execution_date, + ANY_VALUE(job_urls[OFFSET(0)] HAVING MAX daily_dml_per_table) AS sample_dml_job_url, + MAX(daily_dml_per_table) max_daily_table_dml, + FROM optimization_workshop.frequent_daily_table_dml + GROUP BY table_id, table_url + ORDER BY max_daily_table_dml DESC + LIMIT 100; + ``` + +
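+* Tables where frequent DML is also the most expensive, aggregated over the
+  scanned period (a sketch using the columns defined in
+  [frequent_daily_table_dml.sql](frequent_daily_table_dml.sql))
+
+  ```sql
+  SELECT
+    table_id,
+    table_url,
+    SUM(daily_dml_per_table) AS total_dml_statements,
+    SUM(sum_total_slot_ms) / (1000 * 60 * 60) AS total_dml_slot_hours
+  FROM optimization_workshop.frequent_daily_table_dml
+  GROUP BY table_id, table_url
+  ORDER BY total_dml_slot_hours DESC
+  LIMIT 100
+  ```
+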
+ +
🔍 Views with non-optimal JOIN conditions
+
+## Views with non-optimal JOIN conditions
+
+The [views_with_nonoptimal_join_condition.sql](views_with_nonoptimal_join_condition.sql)
+script creates a table named `views_with_nonoptimal_join_condition`, which
+contains views whose JOIN conditions are potentially non-optimal.
+
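+### Examples of querying script results
+
+* List the flagged views (the exact column layout is whatever
+  [views_with_nonoptimal_join_condition.sql](views_with_nonoptimal_join_condition.sql)
+  produces, so this sketch simply selects everything)
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.views_with_nonoptimal_join_condition
+  LIMIT 100
+  ```
+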
+ +# Query Analysis + +
🔍 Queries grouped by hash + +## Queries grouped by hash + +The [queries_grouped_by_hash.sql](queries_grouped_by_hash.sql) script creates a +table named, +`queries_grouped_by_hash`. This table groups queries by their normalized query +pattern, which ignores +comments, parameter values, UDFs, and literals in the query text. +This allows us to group queries that are logically the same, but +have different literals. The `queries_grouped_by_hash` table does not expose the +raw SQL text of the queries. + +The [viewable_queries_grouped_by_hash.sql](viewable_queries_grouped_by_hash.sql) +script creates a table named, +`viewable_queries_grouped_by_hash`. This table is similar to +the `queries_grouped_by_hash` table, but it +exposes the raw SQL text of the queries. +The `viewable_queries_grouped_by_hash.sql` script runs much slower +in execution than the `queries_grouped_by_hash.sql` script because it has to +loop over all projects and for each +project query the `INFORMATION_SCHEMA.JOBS_BY_PROJECT` view. + +For example, the following queries would be grouped together because the date +literal filters are ignored: + +```sql +SELECT * FROM my_table WHERE date = '2020-01-01'; +SELECT * FROM my_table WHERE date = '2020-01-02'; +SELECT * FROM my_table WHERE date = '2020-01-03'; +``` + +### Examples of querying script results + +* Top 100 queries with the highest bytes processed + + ```sql + SELECT * + FROM optimization_workshop.queries_grouped_by_hash + ORDER BY total_gigabytes_processed DESC + LIMIT 100 + ``` + +* Top 100 recurring queries with the highest slot hours consumed + + ```sql + SELECT * + FROM optimization_workshop.queries_grouped_by_hash + ORDER BY total_slot_hours * days_active * job_count DESC + LIMIT 100 + ``` + +
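+* Query patterns that reference a specific table (the table name below is a
+  placeholder; `referenced_tables` is populated by
+  [queries_grouped_by_hash.sql](queries_grouped_by_hash.sql))
+
+  ```sql
+  SELECT
+    query_hash,
+    job_count,
+    days_active,
+    total_slot_hours,
+    top_10_job_urls
+  FROM optimization_workshop.queries_grouped_by_hash
+  WHERE EXISTS (
+    SELECT 1
+    FROM UNNEST(referenced_tables) ref_table
+    WHERE ref_table = 'my-project.my_dataset.my_table'
+  )
+  ORDER BY total_slot_hours DESC
+  LIMIT 100
+  ```
+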
+ +
🔍 Queries grouped by script
+
+## Queries grouped by script
+
+The [queries_grouped_by_script.sql](queries_grouped_by_script.sql) script
+creates a table named `queries_grouped_by_script`. This table groups queries by
+their parent job id, which means that any queries that were executed as part of
+a script (multi-statement query) are grouped together. This table is helpful if
+you want to see which particular scripts are most inefficient.
+
+### Examples of querying script results
+
+* Top 100 scripts with the highest bytes processed
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.queries_grouped_by_script
+  ORDER BY total_gigabytes_processed DESC
+  LIMIT 100
+  ```
+
+* Top 100 scripts with the highest slot hours consumed
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.queries_grouped_by_script
+  ORDER BY total_slot_hours DESC
+  LIMIT 100
+  ```
+
+* Top 100 scripts with the highest slot hours consumed that don't reference
+  INFORMATION_SCHEMA views
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.queries_grouped_by_script
+  WHERE NOT EXISTS(
+    SELECT 1
+    FROM UNNEST(referenced_tables) ref_table
+    WHERE ref_table LIKE "%INFORMATION_SCHEMA%"
+  )
+  ORDER BY total_slot_hours DESC
+  LIMIT 100
+  ```
+
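+* Scripts that spawn the most child jobs (a sketch over the
+  `children_jobs_details` array built by
+  [queries_grouped_by_script.sql](queries_grouped_by_script.sql))
+
+  ```sql
+  SELECT
+    job_url,
+    user_email,
+    ARRAY_LENGTH(children_jobs_details) AS num_child_jobs,
+    total_slot_hours
+  FROM optimization_workshop.queries_grouped_by_script
+  ORDER BY num_child_jobs DESC
+  LIMIT 100
+  ```
+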
+ +
🔍 Hourly slot consumption by query hash
+
+## Hourly slot consumption by query hash
+
+The [hourly_slot_consumption_by_query_hash.sql](hourly_slot_consumption_by_query_hash.sql)
+script creates a table named `hourly_slot_consumption_by_query_hash`. This
+table groups queries by their query hash and then shows, for each one-hour time
+slice, how much of that hour's slot consumption each query hash accounted for.
+Before running the script, set the `my_reservation_id` variable at the top of
+the file to the reservation you want to analyze.
+
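+### Examples of querying script results
+
+* Hours with the highest total slot consumption, along with the query patterns
+  that drove them (column names are the ones defined in
+  [hourly_slot_consumption_by_query_hash.sql](hourly_slot_consumption_by_query_hash.sql))
+
+  ```sql
+  SELECT
+    period_start_hour,
+    period_total_slot_hours,
+    num_jobs_in_period,
+    top_job_patterns_per_hour
+  FROM optimization_workshop.hourly_slot_consumption_by_query_hash
+  ORDER BY period_total_slot_hours DESC
+  LIMIT 24
+  ```
+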
+ +
🔍 Queries with performance insights
+
+## Queries with performance insights
+
+The [query_performance_insights.sql](query_performance_insights.sql) script
+creates a table named `query_performance_insights` that contains all queries
+which have had performance insights generated for them in the past 30 days.
+
+### Examples of querying script results
+
+* Top 100 queries with the most performance insights
+
+  ```sql
+  SELECT
+    job_url,
+    (SELECT COUNT(1)
+      FROM UNNEST(performance_insights.stage_performance_standalone_insights) perf_insight
+      WHERE perf_insight.slot_contention
+    ) AS num_stages_with_slot_contention,
+    (SELECT COUNT(1)
+      FROM UNNEST(performance_insights.stage_performance_standalone_insights) perf_insight
+      WHERE perf_insight.insufficient_shuffle_quota
+    ) AS num_stages_with_insufficient_shuffle_quota,
+    (SELECT ARRAY_AGG(perf_insight.input_data_change.records_read_diff_percentage IGNORE NULLS)
+      FROM UNNEST(performance_insights.stage_performance_change_insights) perf_insight
+    ) AS records_read_diff_percentages
+  FROM optimization_workshop.query_performance_insights
+  ORDER BY (
+    num_stages_with_slot_contention
+    + num_stages_with_insufficient_shuffle_quota
+    + ARRAY_LENGTH(records_read_diff_percentages)
+  ) DESC
+  LIMIT 100
+  ```
+
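+* Queries whose stages hit slot contention (a simpler slice of the same table,
+  using the insight fields referenced above)
+
+  ```sql
+  SELECT job_url
+  FROM optimization_workshop.query_performance_insights
+  WHERE EXISTS (
+    SELECT 1
+    FROM UNNEST(performance_insights.stage_performance_standalone_insights) perf_insight
+    WHERE perf_insight.slot_contention
+  )
+  ```
+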
diff --git a/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql b/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql new file mode 100644 index 000000000..662194d8d --- /dev/null +++ b/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql @@ -0,0 +1,121 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table named, actively_read_tables_with_part_clust_info, + * that contains a list of the most frequently read tables which are: + * - not partitioned + * - not clustered + * - neither partitioned nor clustered + */ + +DECLARE projects ARRAY DEFAULT ( + SELECT ARRAY_AGG(DISTINCT project_id) + FROM optimization_workshop.table_read_patterns +); + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.actively_read_tables_with_part_clust_info +( + project_id STRING, + dataset_id STRING, + table_id STRING, + total_slot_ms FLOAT64, + total_jobs INT64, + max_slot_ms_job_url STRING, + num_days_queried INT64, + predicate_columns STRING, + partitioning_column STRING, + clustering_columns STRING, + table_url STRING, + logical_gigabytes FlOAT64, + logical_terabytes FLOAT64 +); + +FOR p IN ( + SELECT project_id + FROM + UNNEST(projects) project_id +) +DO +BEGIN + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO optimization_workshop.actively_read_tables_with_part_clust_info + SELECT + rp.* EXCEPT(predicates), + ARRAY_TO_STRING(ARRAY_CONCAT_AGG(( + SELECT + ARRAY_AGG(predicate_column_counts.column || ':' || predicate_column_counts.cnt) + FROM ( + SELECT + STRUCT(predicate.column, COUNT(predicate.column) AS cnt) AS predicate_column_counts + FROM UNNEST(predicates) predicate + GROUP BY predicate.column + ))), ', ') AS predicate_columns, + partitioning_column, + clustering_columns, + bqutil.fn.table_url(rp.project_id || '.' || rp.dataset_id || '.' 
|| rp.table_id) AS table_url, + SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes, + SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes, + FROM + `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s + JOIN ( + SELECT + project_id, + dataset_id, + table_id, + ARRAY_CONCAT_AGG(predicates) AS predicates, + SUM(stage_slot_ms) AS total_slot_ms, + COUNT(DISTINCT job_id) AS total_jobs, + ANY_VALUE(job_url HAVING MAX(total_slot_ms)) AS max_slot_ms_job_url, + COUNT(DISTINCT date) AS num_days_queried, + FROM optimization_workshop.table_read_patterns + GROUP BY + project_id, + dataset_id, + table_id + ) rp + ON (s.project_id = rp.project_id AND s.table_schema = rp.dataset_id AND s.table_name = rp.table_id) + JOIN ( + SELECT + table_catalog, + table_schema, + table_name, + STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column, + STRING_AGG( + CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END + ORDER BY CLUSTERING_ORDINAL_POSITION + ) AS clustering_columns + FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS` + GROUP BY 1,2,3 + ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name) + GROUP BY + project_id, + dataset_id, + table_id, + table_url, + total_slot_ms, + total_jobs, + max_slot_ms_job_url, + num_days_queried, + partitioning_column, + clustering_columns, + total_slot_ms; + """, + p.project_id); +EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors +END; +END FOR; diff --git a/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql b/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql new file mode 100644 index 000000000..bd2fe1370 --- /dev/null +++ b/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql @@ -0,0 +1,103 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +DECLARE projects ARRAY DEFAULT ( + SELECT ARRAY_AGG(DISTINCT project_id) + FROM optimization_workshop.table_read_patterns +); + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.actively_read_tables_with_part_clust_info +( + project_id STRING, + dataset_id STRING, + table_id STRING, + total_slot_ms FLOAT64, + total_jobs INT64, + max_slot_ms_job_url STRING, + num_days_queried INT64, + partitioning_column STRING, + clustering_columns STRING, + table_url STRING, + logical_gigabytes FlOAT64, + logical_terabytes FLOAT64 +); + +FOR p IN ( + SELECT project_id + FROM + UNNEST(projects) project_id +) +DO +BEGIN + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO optimization_workshop.actively_read_tables_with_part_clust_info + SELECT + rp.*, + partitioning_column, + clustering_columns, + bqutil.fn.table_url(rp.project_id || '.' || rp.dataset_id || '.' 
|| rp.table_id) AS table_url, + SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes, + SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes, + FROM + `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s + JOIN ( + SELECT + project_id, + dataset_id, + table_id, + SUM(total_slot_ms) AS total_slot_ms, + COUNT(DISTINCT job_id) AS total_jobs, + ANY_VALUE(job_url HAVING MAX(total_slot_ms)) AS max_slot_ms_job_url, + COUNT(DISTINCT date) AS num_days_queried, + FROM optimization_workshop.table_read_patterns + GROUP BY + project_id, + dataset_id, + table_id + ) rp + ON (s.project_id = rp.project_id AND s.table_schema = rp.dataset_id AND s.table_name = rp.table_id) + JOIN ( + SELECT + table_catalog, + table_schema, + table_name, + STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column, + STRING_AGG( + CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END + ORDER BY CLUSTERING_ORDINAL_POSITION + ) AS clustering_columns + FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS` + GROUP BY 1,2,3 + ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name) + GROUP BY + project_id, + dataset_id, + table_id, + table_url, + total_slot_ms, + total_jobs, + max_slot_ms_job_url, + num_days_queried, + partitioning_column, + clustering_columns, + total_slot_ms; + """, + p.project_id); +EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors +END; +END FOR; \ No newline at end of file diff --git a/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql b/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql new file mode 100644 index 000000000..659ec4633 --- /dev/null +++ b/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql @@ -0,0 +1,57 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The following script creates a table named, table_read_patterns, + * that contains a list of the most frequently read tables within the + * past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns +CLUSTER BY project_id, dataset_id, table_id AS +SELECT + DATE(creation_time) AS date, + jbo.project_id, + table.dataset_id, + table.table_id, + table.project_id || '.' || table.dataset_id || '.' || table.table_id AS full_table_id, + job_id, + bqutil.fn.job_url(jbo.project_id || ':us.' || job_id) AS job_url, + parent_job_id, + bqutil.fn.job_url(jbo.project_id || ':us.' 
|| parent_job_id) AS parent_job_url, + reservation_id, + total_bytes_billed, + total_slot_ms, + creation_time, + start_time, + end_time +FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo, +UNNEST(referenced_tables) table +WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND ( + table.project_id||table.dataset_id||table.table_id + <> destination_table.project_id||destination_table.dataset_id||destination_table.table_id + ) + AND job_type = 'QUERY' + AND statement_type != 'SCRIPT' + AND NOT cache_hit + AND error_result IS NULL; diff --git a/scripts/optimization/daily_project_analysis.sql b/scripts/optimization/daily_project_analysis.sql new file mode 100644 index 000000000..4b30a848b --- /dev/null +++ b/scripts/optimization/daily_project_analysis.sql @@ -0,0 +1,94 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table that contains daily slot consumption information + * about BigQuery jobs for a given set of projects within the past 30 days. + * + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.daily_project_analysis AS +SELECT + day, + project_id, + COUNT(*) AS job_count, + AVG(total_slot_ms)/1000 AS avg_total_slot_secs, + MAX(median_total_slot_ms)/1000 AS median_total_slot_secs, + MAX(p80_total_slot_ms)/1000 AS p80_total_slot_secs, + SUM(total_slot_ms)/1000/60/60 AS total_slot_hours, + AVG(time_secs) AS avg_time_secs, + MAX(median_time_secs) AS median_time_secs, + SUM(time_secs)/60/60 AS total_time_hours, + MAX(p80_time_secs) AS p80_time_secs, + AVG(bytes_scanned)/POW(1024,3) AS avg_gb_scanned, + MAX(p80_bytes_scanned)/POW(1024,3) AS p80_gb_scanned, + SUM(bytes_scanned)/POW(1024,4) AS total_tb_scanned, + AVG(bytes_shuffled)/POW(1024,3) AS avg_gb_shuffled, + MAX(p80_bytes_shuffled)/POW(1024,3) AS p80_gb_shuffled, + SUM(bytes_shuffled)/POW(1024,4) AS total_tb_shuffled, + AVG(bytes_spilled)/POW(1024,3) AS avg_gb_spilled, + MAX(p80_bytes_spilled)/POW(1024,3) AS p80_gb_spilled, + SUM(bytes_spilled)/POW(1024,4) AS total_tb_spilled, +FROM( + SELECT + day, + project_id, + total_slot_ms, + PERCENTILE_CONT(total_slot_ms, 0.5) + OVER (PARTITION BY day, project_id) AS median_total_slot_ms, + PERCENTILE_CONT(total_slot_ms, 0.8) + OVER (PARTITION BY day, project_id) AS p80_total_slot_ms, + time_secs, + PERCENTILE_CONT(time_secs, 0.5) + OVER (PARTITION BY day, project_id) AS median_time_secs, + PERCENTILE_CONT(time_secs, 0.8) + OVER (PARTITION BY day, project_id) AS p80_time_secs, + total_bytes_scanned bytes_scanned, + PERCENTILE_CONT(total_bytes_scanned, 0.8) + OVER (PARTITION BY day, project_id) AS p80_bytes_scanned, + bytes_shuffled, + PERCENTILE_CONT(bytes_shuffled, 0.8) + OVER (PARTITION BY day, project_id) AS p80_bytes_shuffled, + bytes_spilled, + 
PERCENTILE_CONT(bytes_spilled, 0.8) + OVER (PARTITION BY day, project_id) AS p80_bytes_spilled + FROM( + SELECT + DATE(jbo.creation_time) AS day, + project_id, + job_id, + total_slot_ms, + TIMESTAMP_DIFF(jbo.end_time,jbo.start_time, SECOND) AS time_secs, + total_bytes_billed total_bytes_scanned, + (SELECT SUM(stage.shuffle_output_bytes) FROM UNNEST(job_stages) stage) AS bytes_shuffled, + (SELECT SUM(stage.shuffle_output_bytes_spilled) FROM UNNEST(job_stages) stage) AS bytes_spilled + FROM + `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo + WHERE + DATE(jbo.creation_time) >= CURRENT_DATE - num_days_to_scan + -- Uncomment below to specify a list of projects to analyze + -- AND jbo.project_id IN () + AND jbo.job_type = 'QUERY' + AND jbo.end_time > jbo.start_time + AND jbo.error_result IS NULL + AND jbo.statement_type != 'SCRIPT' +)) +GROUP BY 1, 2; diff --git a/scripts/optimization/enable_recommender_api.sh b/scripts/optimization/enable_recommender_api.sh new file mode 100644 index 000000000..6aeca2742 --- /dev/null +++ b/scripts/optimization/enable_recommender_api.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The following script retrieves all distinct projects from the JOBS_BY_ORGANIZATION view +# and then enables the recommender API for each project. +# This is useful for when you have a large number of projects and you want to enable the +# recommender API for all of them. +projects=$( + bq query \ + --nouse_legacy_sql \ + --format=csv \ + "SELECT DISTINCT project_id FROM \`region-us\`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION" | + sed 1d +) +for proj in $projects; do + gcloud services --project="${proj}" enable recommender.googleapis.com & +done diff --git a/scripts/optimization/frequent_daily_table_dml.sql b/scripts/optimization/frequent_daily_table_dml.sql new file mode 100644 index 000000000..897424598 --- /dev/null +++ b/scripts/optimization/frequent_daily_table_dml.sql @@ -0,0 +1,52 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The following script will return tables that have had > 24 DML statements + * run against in any one day within the past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. 
+ */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.frequent_daily_table_dml AS +SELECT + destination_table.project_id, + destination_table.dataset_id, + destination_table.table_id, + EXTRACT(DATE FROM creation_time) AS dml_execution_date, + bqutil.fn.table_url(destination_table.project_id || '.' || destination_table.dataset_id || '.' || destination_table.table_id) AS table_url, + COUNT(1) AS daily_dml_per_table, + ARRAY_AGG(job_id) AS job_ids, + ARRAY_AGG(bqutil.fn.job_url(project_id || ':us.' || job_id) IGNORE NULLS) AS job_urls, + ARRAY_AGG(DISTINCT statement_type) AS statement_types, + SUM(SAFE_DIVIDE(total_bytes_processed, pow(2,30))) AS sum_total_gb_processed, + AVG(SAFE_DIVIDE(total_bytes_processed, pow(2,30))) AS avg_total_gb_processed, + SUM(total_slot_ms) AS sum_total_slot_ms, + AVG(total_slot_ms) AS avg_total_slot_ms, + SUM(SAFE_DIVIDE(total_slot_ms, (TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)))) AS sum_avg_slots, + AVG(SAFE_DIVIDE(total_slot_ms, (TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)))) AS avg_avg_slots, +FROM + `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +WHERE + -- Look at the past 30 days of jobs + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + -- Only look at DML statements + AND statement_type IN ('INSERT', 'UPDATE', 'DELETE', 'MERGE') + GROUP BY 1,2,3,4,5 + HAVING daily_dml_per_table > 24; diff --git a/scripts/optimization/hourly_slot_consumption_by_query_hash.sql b/scripts/optimization/hourly_slot_consumption_by_query_hash.sql new file mode 100644 index 000000000..337e5e2b9 --- /dev/null +++ b/scripts/optimization/hourly_slot_consumption_by_query_hash.sql @@ -0,0 +1,128 @@ +DECLARE num_days_to_scan INT64 DEFAULT 30; +DECLARE my_reservation_id STRING DEFAULT "your_reservation_id"; + + +CREATE OR REPLACE TABLE `optimization_workshop.hourly_slot_consumption_by_query_hash` AS +SELECT * EXCEPT(jobs_per_hour), +-- Get the top 10 query patterns by slot hours +ARRAY( + SELECT AS STRUCT + -- since period is hourly, period_slot_hours/(1hr duration) = period_num_slots + ROUND(period_slot_hours, 2) AS num_period_slots, + ROUND(period_slot_hours / period_total_slot_hours * 100, 2) || "%" AS pct_of_total_period, + COALESCE(slot_hunger,0) AS slot_hunger, + ROUND(avg_period_estimated_runnable_units,2) AS avg_period_estimated_runnable_units, + ROUND(max_duration_minutes, 2) AS max_duration_minutes, + ROUND(max_p90_period_slots, 2) AS max_p90_period_slots, + ROUND(p90_avg_job_period_slots, 2) AS p90_avg_job_period_slots, + ROUND(p90_duration_minutes, 2) AS p90_duration_minutes, + query_hash, + num_days_query_hash_active, + labels, + num_jobs_per_period, + job_url, + parent_job_url, + FROM( + SELECT + query_hash, + ANY_VALUE(num_days_query_hash_active) AS num_days_query_hash_active, + ARRAY_AGG(DISTINCT FORMAT("%T", labels)) AS labels, + MAX(duration_ms)/1000/60 AS max_duration_minutes, + SUM(slot_hunger) AS slot_hunger, + AVG(avg_period_estimated_runnable_units) AS avg_period_estimated_runnable_units, + MAX(p90_job_period_slots) AS max_p90_period_slots, + ANY_VALUE(p90_avg_job_period_slots) AS p90_avg_job_period_slots, + ANY_VALUE(p90_duration_minutes) AS p90_duration_minutes, + SUM(period_slot_hours) AS period_slot_hours, + COUNT(1) AS num_jobs_per_period, + ANY_VALUE(job_url HAVING MAX duration_ms) AS job_url, + ANY_VALUE(parent_job_url HAVING MAX duration_ms) AS parent_job_url, + ANY_VALUE(user_email HAVING MAX duration_ms) AS user_email, + FROM ( + 
SELECT *, + PERCENTILE_CONT(duration_ms, 0.9) OVER( + PARTITION BY query_hash + )/1000/60 AS p90_duration_minutes, + PERCENTILE_CONT(avg_job_period_slots, 0.9) OVER( + PARTITION BY query_hash + ) AS p90_avg_job_period_slots, + FROM UNNEST(jobs_per_hour) + ) + GROUP BY 1 + ORDER BY period_slot_hours DESC + LIMIT 10 + ) +) AS top_job_patterns_per_hour +FROM( + SELECT + FORMAT_TIMESTAMP("%F %H", period_start_hour, "America/New_York") AS period_start_hour, + COUNT(DISTINCT job_id) AS num_jobs_in_period, + SUM(period_slot_ms)/1000/60/60 AS period_total_slot_hours, + MAX(p90_job_period_slots) AS max_p90_job_period_slots, + ARRAY_AGG(STRUCT( + period_slot_ms/1000/60/60 AS period_slot_hours, + p90_job_period_slots, + avg_job_period_slots, + duration_ms, + avg_period_estimated_runnable_units, + slot_hunger, + query_hash, + num_days_query_hash_active, + labels, + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url, + COALESCE(bqutil.fn.job_url(project_id || ':us.' || parent_job_id), '') AS parent_job_url, + user_email + )) AS jobs_per_hour, + FROM( + SELECT + TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York") AS period_start_hour, + project_id, + parent_job_id, + job_id, + user_email, + AVG(COALESCE(period_estimated_runnable_units,0)) AS avg_period_estimated_runnable_units, + SUM(IF(COALESCE(period_estimated_runnable_units,0) >= 1000,1,0)) AS slot_hunger, + SUM(period_slot_ms) AS period_slot_ms, + ANY_VALUE(p90_job_period_slots) AS p90_job_period_slots, + ANY_VALUE(avg_job_period_slots) AS avg_job_period_slots, + FROM( + SELECT *, + PERCENTILE_CONT(period_slot_ms/1000, 0.9) OVER( + PARTITION BY TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York"), job_id + ) AS p90_job_period_slots, + AVG(period_slot_ms/1000) OVER( + PARTITION BY TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York"), job_id + ) AS avg_job_period_slots, + + FROM `region-us`.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION + WHERE + DATE(job_creation_time) >= CURRENT_DATE - num_days_to_scan + AND reservation_id = my_reservation_id + AND state IN ('RUNNING','DONE') + AND statement_type != 'SCRIPT' + AND period_slot_ms > 0 + -- Uncomment below to ignore slots consumed by failed queries + -- AND error_result IS NULL + ) + GROUP BY 1,2,3,4,5 + ) + JOIN ( + SELECT + job_id, + query_info.query_hashes.normalized_literals AS query_hash, + labels, + TIMESTAMP_DIFF(end_time, start_time, MILLISECOND) AS duration_ms, + COUNT(DISTINCT DATE(creation_time)) OVER( + PARTITION BY query_info.query_hashes.normalized_literals + ) AS num_days_query_hash_active, + FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION + WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND reservation_id = my_reservation_id + AND statement_type != 'SCRIPT' + AND total_slot_ms > 0 + -- Uncomment below to ignore slots consumed by failed queries + -- AND error_result IS NULL + ) USING(job_id) + GROUP BY period_start_hour +); \ No newline at end of file diff --git a/scripts/optimization/queries_grouped_by_hash.sql b/scripts/optimization/queries_grouped_by_hash.sql new file mode 100644 index 000000000..e4bbfdce0 --- /dev/null +++ b/scripts/optimization/queries_grouped_by_hash.sql @@ -0,0 +1,84 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table named, top_bytes_scanning_queries_by_hash, + * which contains the top 200 most expensive queries by total bytes scanned + * within the past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + * Queries are grouped by their normalized query pattern, which ignores + * comments, parameter values, UDFs, and literals in the query text. + * This allows us to group queries that are logically the same, but + * have different literals. + * + * For example, the following queries would be grouped together: + * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01' + * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02' + * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03' + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS ( + COALESCE(( + SELECT SUM(IF(i.slot_contention, 1, 0) + IF(i.insufficient_shuffle_quota, 1, 0)) + FROM UNNEST(query_info.performance_insights.stage_performance_standalone_insights) i), 0) + + COALESCE(ARRAY_LENGTH(query_info.performance_insights.stage_performance_change_insights), 0) +); + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash AS +SELECT + statement_type, + query_info.query_hashes.normalized_literals AS query_hash, + COUNT(DISTINCT DATE(start_time)) AS days_active, + ARRAY_AGG(DISTINCT project_id IGNORE NULLS) AS project_ids, + ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids, + SUM(num_stages_with_perf_insights(query_info)) AS num_stages_with_perf_insights, + COUNT(DISTINCT (project_id || ':us.' || job_id)) AS job_count, + ARRAY_AGG( + STRUCT( + bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url, + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url + ) + ORDER BY total_slot_ms + DESC LIMIT 10) AS top_10_job_urls, + ARRAY_AGG(DISTINCT user_email) AS user_emails, + SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed, + AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed, + SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours, + AVG(total_slot_ms) / (1000 * 60 * 60) AS avg_total_slot_hours_per_active_day, + AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds, + ARRAY_AGG(DISTINCT FORMAT("%T",labels)) AS labels, + SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots, + AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots, + -- query hashes will all have the same referenced tables so we can use ANY_VALUE below + ANY_VALUE(ARRAY( + SELECT + ref_table.project_id || '.' || + IF(STARTS_WITH(ref_table.dataset_id, '_'), 'TEMP', ref_table.dataset_id) + || '.' 
|| ref_table.table_id + FROM UNNEST(referenced_tables) ref_table + )) AS referenced_tables, +FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND state = 'DONE' + AND error_result IS NULL + AND job_type = 'QUERY' + AND statement_type != 'SCRIPT' +GROUP BY statement_type, query_hash; diff --git a/scripts/optimization/queries_grouped_by_labels.sql b/scripts/optimization/queries_grouped_by_labels.sql new file mode 100644 index 000000000..77d5514cb --- /dev/null +++ b/scripts/optimization/queries_grouped_by_labels.sql @@ -0,0 +1,41 @@ +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS ( + COALESCE(( + SELECT SUM(IF(i.slot_contention, 1, 0) + IF(i.insufficient_shuffle_quota, 1, 0)) + FROM UNNEST(query_info.performance_insights.stage_performance_standalone_insights) i), 0) + + COALESCE(ARRAY_LENGTH(query_info.performance_insights.stage_performance_change_insights), 0) +); + +CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_labels AS +SELECT + FORMAT("%T", labels) AS labels, + COUNT(DISTINCT DATE(start_time)) AS days_active, + ARRAY_AGG(DISTINCT project_id IGNORE NULLS) AS project_ids, + ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids, + SUM(num_stages_with_perf_insights(query_info)) AS num_stages_with_perf_insights, + COUNT(DISTINCT (project_id || ':us.' || job_id)) AS job_count, + ARRAY_AGG( + STRUCT( + bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url, + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url + ) + ORDER BY total_slot_ms + DESC LIMIT 10) AS top_10_job_urls, + ARRAY_AGG(DISTINCT user_email) AS user_emails, + SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed, + AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed, + SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours, + SUM(total_slot_ms) / (1000 * 60 * 60) / COUNT(DISTINCT DATE(start_time)) AS avg_total_slot_hours_per_active_day, + AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds, + SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots, + AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots, +FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND state = 'DONE' + AND error_result IS NULL + AND job_type = 'QUERY' + AND statement_type != 'SCRIPT' + AND user_email LIKE '%gserviceaccount.com' +GROUP BY labels; \ No newline at end of file diff --git a/scripts/optimization/queries_grouped_by_script.sql b/scripts/optimization/queries_grouped_by_script.sql new file mode 100644 index 000000000..e7803cff8 --- /dev/null +++ b/scripts/optimization/queries_grouped_by_script.sql @@ -0,0 +1,77 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE TEMP FUNCTION job_stage_max_slots(job_stages ANY TYPE) AS (( + SELECT MAX(SAFE_DIVIDE(stage.slot_ms,stage.end_ms - stage.start_ms)) + FROM UNNEST(job_stages) stage +)); +CREATE TEMP FUNCTION total_bytes_shuffled(job_stages ANY TYPE) AS (( + SELECT SUM(stage.shuffle_output_bytes) + FROM UNNEST(job_stages) stage +)); +CREATE TEMP FUNCTION total_shuffle_bytes_spilled(job_stages ANY TYPE) AS (( + SELECT SUM(stage.shuffle_output_bytes_spilled) + FROM UNNEST(job_stages) stage +)); + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_script AS +SELECT * REPLACE(( + SELECT + ARRAY_AGG(DISTINCT table.project_id || "." || table.dataset_id || "." || table.table_id) + FROM UNNEST(referenced_tables) table + ) AS referenced_tables) +FROM( + SELECT + bqutil.fn.job_url( + parent.project_id || ':us.' || parent.job_id) AS job_url, + parent.user_email, + parent.start_time, + ANY_VALUE(parent.labels) AS labels, + SUM(COALESCE(SAFE_DIVIDE(child.total_slot_ms, TIMESTAMP_DIFF( + child.end_time, child.start_time, MILLISECOND)), 0)) AS total_slots, + SUM(COALESCE(child.total_slot_ms, 0)) AS total_slot_ms, + SUM(COALESCE(child.total_slot_ms / (1000 * 60 * 60), 0)) AS total_slot_hours, + SUM(COALESCE(child.total_bytes_processed, 0)) / POW(1024, 3) AS total_gigabytes_processed, + SUM(COALESCE(child.total_bytes_processed, 0)) / POW(1024, 4) AS total_terabytes_processed, + ARRAY_CONCAT_AGG(child.referenced_tables) AS referenced_tables, + ARRAY_AGG(STRUCT( + bqutil.fn.job_url( + child.project_id || ':us.' || child.job_id) AS job_url, + child.reservation_id AS reservation_id, + EXTRACT(DATE FROM child.creation_time) AS creation_date, + TIMESTAMP_DIFF(child.end_time, child.start_time, SECOND) AS job_duration_seconds, + child.total_slot_ms AS total_slot_ms, + SAFE_DIVIDE(child.total_slot_ms,TIMESTAMP_DIFF( + child.end_time, child.start_time, MILLISECOND)) AS job_avg_slots, + job_stage_max_slots(child.job_stages) AS job_stage_max_slots, + total_bytes_shuffled(child.job_stages) AS total_bytes_shuffled, + total_shuffle_bytes_spilled(child.job_stages) AS total_shuffle_bytes_spilled) + ORDER BY child.start_time + ) AS children_jobs_details + FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION AS parent + JOIN `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION AS child + ON parent.job_id = child.parent_job_id + WHERE + DATE(parent.creation_time) >= CURRENT_DATE - num_days_to_scan + AND parent.state = 'DONE' + AND parent.error_result IS NULL + GROUP BY job_url, user_email, start_time + HAVING + ARRAY_LENGTH(children_jobs_details) > 1 + AND total_slot_ms > 0); diff --git a/scripts/optimization/query_performance_insights.sql b/scripts/optimization/query_performance_insights.sql new file mode 100644 index 000000000..1e0218893 --- /dev/null +++ b/scripts/optimization/query_performance_insights.sql @@ -0,0 +1,51 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This script retrieves the top 100 queries that have had performance insights + * generated for them in the past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.query_performance_insights AS +SELECT + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url, + query_info.performance_insights AS performance_insights +FROM + `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND job_type = 'QUERY' + AND state = 'DONE' + AND error_result IS NULL + AND statement_type != 'SCRIPT' + AND EXISTS ( -- Only include queries which had performance insights + SELECT 1 + FROM UNNEST( + query_info.performance_insights.stage_performance_standalone_insights + ) + WHERE slot_contention OR insufficient_shuffle_quota + UNION ALL + SELECT 1 + FROM UNNEST( + query_info.performance_insights.stage_performance_change_insights + ) + WHERE input_data_change.records_read_diff_percentage IS NOT NULL + ); diff --git a/scripts/optimization/run_all_scripts.sh b/scripts/optimization/run_all_scripts.sh new file mode 100644 index 000000000..596dca24a --- /dev/null +++ b/scripts/optimization/run_all_scripts.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Exit immediately if a command exits with a non-zero status. +set -e +# Set the following flags for the bq command: +# --quiet: suppress status updates while jobs are running +# --nouse_legacy_sql: use standard SQL syntax +# --nouse_cache: do not use cached results +bq_flags="--quiet --nouse_legacy_sql --nouse_cache" + +# Run all the .sql files in the current directory in parallel, +# except for table_read_patterns.sql +# and actively_read_tables_with_partitioning_clustering_info.sql +# since they'll be run sequentially due to the depedency between them. +for f in *.sql; do + if [[ $f = "table_read_patterns.sql" || + $f = "actively_read_tables_with_partitioning_clustering_info.sql" ]]; then + # Skip this file, it's already been run + continue + fi + bq query ${bq_flags} < $f & +done + +# Run the table_read_patterns.sql file first because it's a dependency for +# actively_read_tables_with_partitioning_clustering_info.sql +bq query ${bq_flags} DEFAULT ( + SELECT ARRAY_AGG(DISTINCT project_id) + FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION + WHERE DATE(creation_time) >= CURRENT_DATE - num_days_to_scan +); + +/* +-- Uncomment this block if you just want to scan the top 1000 projects +-- by total bytes billed in the past 30 days. 
+DECLARE projects ARRAY<STRING> DEFAULT (
+  SELECT ARRAY_AGG(project_id)
+  FROM(
+    SELECT project_id
+    FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+    WHERE DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+    GROUP BY 1
+    ORDER BY SUM(total_bytes_billed) DESC
+    LIMIT 1000
+));
+*/
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns
+(
+  date DATE,
+  project_id STRING,
+  dataset_id STRING,
+  table_id STRING,
+  full_table_id STRING,
+  job_id STRING,
+  job_url STRING,
+  parent_job_id STRING,
+  parent_job_url STRING,
+  reservation_id STRING,
+  total_bytes_billed INT64,
+  total_slot_ms INT64,
+  creation_time TIMESTAMP,
+  start_time TIMESTAMP,
+  end_time TIMESTAMP,
+  stage_name STRING,
+  stage_id INT64,
+  stage_slot_ms INT64,
+  total_job_read_slot_ms INT64,
+  records_read INT64,
+  records_written INT64,
+  shuffle_output_bytes INT64,
+  shuffle_output_bytes_spilled INT64,
+  parallel_inputs INT64,
+  read_ratio_avg FLOAT64,
+  read_ms_avg INT64,
+  wait_ratio_avg FLOAT64,
+  wait_ms_avg INT64,
+  compute_ratio_avg FLOAT64,
+  compute_ms_avg INT64,
+  write_ratio_avg FLOAT64,
+  write_ms_avg INT64,
+  predicates ARRAY<STRUCT<operator STRING, column STRING, value STRING>>
+) CLUSTER BY project_id, dataset_id, table_id;
+
+CREATE TEMP FUNCTION mapColumns(where_clause STRING, column_list ARRAY<STRING>)
+RETURNS STRING
+LANGUAGE js AS r"""
+  if (!where_clause) {
+    return null;
+  }
+  column_list.forEach(function(col) {
+    const tokens = col.split(":");
+    where_clause = where_clause.replaceAll(tokens[0].trim(), tokens[1]);
+  });
+  return where_clause;
+""";
+
+FOR p IN (
+  SELECT project_id
+  FROM
+    UNNEST(projects) project_id
+)
+DO
+BEGIN
+EXECUTE IMMEDIATE FORMAT("""
+INSERT INTO optimization_workshop.table_read_patterns
+  SELECT
+    DATE(creation_time) AS date,
+    project_id,
+    IF(ARRAY_LENGTH(SPLIT(table_id, '.'))=2, SPLIT(table_id, '.')[OFFSET(0)], SPLIT(table_id, '.')[OFFSET(1)]) AS dataset_id,
+    SPLIT(table_id, '.')[ORDINAL(ARRAY_LENGTH(SPLIT(table_id, '.')))] AS table_id,
+    IF(ARRAY_LENGTH(SPLIT(table_id, '.'))=2, project_id || '.' || table_id, table_id) AS full_table_id,
+    job_id,
+    bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url,
+    parent_job_id,
+    bqutil.fn.job_url(project_id || ':us.'
|| parent_job_id) AS parent_job_url, + reservation_id, + total_bytes_billed, + total_slot_ms, + creation_time, + start_time, + end_time, + stage_name, + stage_id, + stage_slot_ms, + total_job_read_slot_ms, + records_read, + records_written, + shuffle_output_bytes, + shuffle_output_bytes_spilled, + parallel_inputs, + read_ratio_avg, + read_ms_avg, + wait_ratio_avg, + wait_ms_avg, + compute_ratio_avg, + compute_ms_avg, + write_ratio_avg, + write_ms_avg, + ARRAY( + SELECT STRUCT( + REGEXP_EXTRACT(predicate, '^[[:word:]]+') AS operator, + REGEXP_EXTRACT(predicate, '[(]([[:word:]]+)') AS column, + REGEXP_EXTRACT(predicate, '[,](.+)[)]') AS value ) + FROM UNNEST(filters) AS predicate + ) AS predicates + FROM ( + SELECT *, + REGEXP_EXTRACT_ALL( + mapcolumns(where_clause, projection_list), + '[[:word:]]+[(][^()]*?[)]') AS filters + FROM ( + SELECT + jbp.project_id, + job_id, + parent_job_id, + reservation_id, + total_bytes_billed, + total_slot_ms, + creation_time, + start_time, + end_time, + js.name AS stage_name, + js.id AS stage_id, + SUM(js.slot_ms) OVER (PARTITION BY job_id) AS total_job_read_slot_ms, + js.slot_ms AS stage_slot_ms, + js.records_read, + js.records_written, + js.shuffle_output_bytes, + js.shuffle_output_bytes_spilled, + js.parallel_inputs, + js.read_ratio_avg, + js.read_ms_avg, + js.wait_ratio_avg, + js.wait_ms_avg, + js.compute_ratio_avg, + js.compute_ms_avg, + js.write_ratio_avg, + js.write_ms_avg, + SPLIT(js_steps.substeps[safe_OFFSET(0)], ',') AS projection_list, + REPLACE(js_steps.substeps[safe_OFFSET(1)],'FROM ', '') AS table_id, + js_steps.substeps[safe_OFFSET(2)] AS where_clause + FROM `%s.region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT jbp + JOIN UNNEST(job_stages) AS js + JOIN UNNEST(steps) AS js_steps + WHERE + DATE(creation_time) >= CURRENT_DATE - %i + AND js_steps.kind = 'READ' + AND jbp.job_type = 'QUERY' + AND jbp.statement_type != 'SCRIPT' + AND NOT cache_hit + AND error_result IS NULL + AND NOT EXISTS ( -- Exclude queries over INFORMATION_SCHEMA + SELECT 1 + FROM UNNEST(js_steps.substeps) AS substeps + WHERE substeps LIKE 'FROM %%.INFORMATION_SCHEMA.%%') + AND EXISTS ( -- Only include substeps with a FROM clause + SELECT 1 + FROM UNNEST(js_steps.substeps) AS substeps + WHERE substeps LIKE 'FROM %%.%%') +))""", p.project_id, num_days_to_scan); +EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors +END; +END FOR; diff --git a/scripts/optimization/table_read_patterns_by_query_hash.sql b/scripts/optimization/table_read_patterns_by_query_hash.sql new file mode 100644 index 000000000..dd69ec428 --- /dev/null +++ b/scripts/optimization/table_read_patterns_by_query_hash.sql @@ -0,0 +1,90 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This script creates a table named, table_read_patterns_by_query_hash + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns_by_query_hash +CLUSTER BY project_id, dataset_id, table_id AS +SELECT + project_id, + dataset_id, + table_id, + COUNT(query_pattern) AS num_query_patterns, + ARRAY_AGG( + STRUCT( + query_pattern, + top_10_slot_ms_jobs, + avg_slot_hours, + days_active, + job_count, + total_gigabytes_processed, + avg_gigabytes_processed, + total_slot_hours, + avg_total_slot_hours_per_active_day, + avg_job_duration_seconds, + total_slots, + avg_total_slots + ) + ORDER BY avg_slot_hours * days_active * job_count DESC LIMIT 10 + ) AS top_10_slot_ms_patterns, + MAX(days_active) AS max_days_active, + MAX(avg_slot_hours) AS max_avg_slot_hours_across_all_patterns, + MAX(days_active * avg_slot_hours) AS max_weighted_avg_slot_hours, +FROM( + SELECT + referenced_table.project_id, + referenced_table.dataset_id, + referenced_table.table_id, + query_info.query_hashes.normalized_literals AS query_pattern, + ARRAY_AGG(STRUCT( + bqutil.fn.job_url(jbo.project_id || ':us.' || job_id) AS job_url, + bqutil.fn.job_url(jbo.project_id || ':us.' || parent_job_id) AS parent_job_url + ) + ORDER BY total_slot_ms DESC LIMIT 10 + ) AS top_10_slot_ms_jobs, + COUNT(DISTINCT DATE(start_time)) AS days_active, + ARRAY_AGG(DISTINCT jbo.project_id IGNORE NULLS) AS project_ids, + ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids, + COUNT(DISTINCT job_Id) AS job_count, + ARRAY_AGG(DISTINCT user_email) AS user_emails, + SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed, + AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed, + SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours, + AVG(total_slot_ms) / (1000 * 60 * 60) AS avg_slot_hours, + SUM(total_slot_ms) / (1000 * 60 * 60) / COUNT(DISTINCT DATE(start_time)) AS avg_total_slot_hours_per_active_day, + AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds, + SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots, + AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots, + FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo, UNNEST(referenced_tables) referenced_table + WHERE + DATE(start_time) >= CURRENT_DATE - num_days_to_scan + AND ( + referenced_table.project_id||referenced_table.dataset_id||referenced_table.table_id + <> destination_table.project_id||destination_table.dataset_id||destination_table.table_id + ) + AND state = 'DONE' + AND error_result IS NULL + AND job_type = 'QUERY' + AND statement_type != 'SCRIPT' + AND referenced_table.table_id NOT LIKE '%INFORMATION_SCHEMA%' + AND user_email LIKE '%gserviceaccount.com' + GROUP BY 1,2,3,4 +) GROUP BY 1,2,3; diff --git a/scripts/optimization/tables_without_partitioning_or_clustering.sql b/scripts/optimization/tables_without_partitioning_or_clustering.sql new file mode 100644 index 000000000..578309368 --- /dev/null +++ b/scripts/optimization/tables_without_partitioning_or_clustering.sql @@ -0,0 +1,99 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named, tables_without_part_clust,
+ * that contains a list of the largest tables which are:
+ *   - not partitioned
+ *   - not clustered
+ *   - neither partitioned nor clustered
+ */
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+  SELECT
+    ARRAY_AGG(project_id)
+  FROM(
+    SELECT project_id
+    FROM `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION`
+    WHERE NOT deleted
+    GROUP BY 1
+    ORDER BY SUM(total_logical_bytes) DESC
+    LIMIT 100
+  )
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.tables_without_part_clust
+(
+  table_catalog STRING,
+  table_schema STRING,
+  table_name STRING,
+  table_url STRING,
+  partitioning_column STRING,
+  clustering_columns STRING,
+  logical_gigabytes FLOAT64,
+  logical_terabytes FLOAT64
+);
+
+FOR p IN (
+  SELECT project_id
+  FROM
+    UNNEST(projects) project_id
+)
+DO
+BEGIN
+  EXECUTE IMMEDIATE FORMAT(
+    """
+    INSERT INTO
+      optimization_workshop.tables_without_part_clust
+    SELECT
+      s.table_catalog,
+      s.table_schema,
+      s.table_name,
+      bqutil.fn.table_url(s.table_catalog || '.' || s.table_schema || '.' || s.table_name) AS table_url,
+      partitioning_column,
+      clustering_columns,
+      SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes,
+      SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes,
+    FROM
+      `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s
+    JOIN (
+      SELECT
+        table_catalog,
+        table_schema,
+        table_name,
+        STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column,
+        STRING_AGG(
+          CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END
+          ORDER BY CLUSTERING_ORDINAL_POSITION
+        ) AS clustering_columns
+      FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS`
+      GROUP BY 1,2,3
+    ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name)
+    WHERE
+      clustering_columns IS NULL OR partitioning_column IS NULL
+    GROUP BY
+      s.table_catalog,
+      s.table_schema,
+      s.table_name,
+      table_url,
+      partitioning_column,
+      clustering_columns;
+    """,
+    p.project_id);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; -- ignore errors
+END;
+END FOR;
diff --git a/scripts/optimization/viewable_queries_grouped_by_hash.sql b/scripts/optimization/viewable_queries_grouped_by_hash.sql
new file mode 100644
index 000000000..ffd275f91
--- /dev/null
+++ b/scripts/optimization/viewable_queries_grouped_by_hash.sql
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named, viewable_queries_grouped_by_hash,
+ * which groups the queries run over the past 30 days by their query hash and
+ * retains a raw sample of the query text for each group.
+ * 30 days is the default timeframe, but you can change this by setting the
+ * num_days_to_scan variable to a different value.
+ * Queries are grouped by their normalized query pattern, which ignores
+ * comments, parameter values, UDFs, and literals in the query text.
+ * This allows us to group queries that are logically the same, but
+ * have different literals.
+ *
+ * For example, the following queries would be grouped together:
+ *   SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01'
+ *   SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02'
+ *   SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03'
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+  SELECT
+    ARRAY_AGG(project_id)
+  FROM(
+    SELECT project_id
+    FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+    WHERE DATE(creation_time) >= CURRENT_DATE - 30
+    GROUP BY 1
+    ORDER BY SUM(total_bytes_billed) DESC
+    LIMIT 100
+  )
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.viewable_queries_grouped_by_hash
+(
+  Query_Hash STRING,
+  Query_Raw_Sample STRING,
+  Job_Origin STRING,
+  Ref_Tables STRING,
+  Days_Active INT64,
+  Job_Count INT64,
+  Avg_Job_Count_Active_Days INT64,
+  Project_Id STRING,
+  BQ_Region STRING,
+  Reservation_Id STRING,
+  Total_Gigabytes_Processed INT64,
+  Total_Gigabytes_Processed_Per_Job INT64,
+  Avg_Gigabytes_Processed INT64,
+  Total_Slot_Hours INT64,
+  Avg_Total_Slot_Hours_per_Active_Day INT64,
+  Avg_Job_Duration_Seconds INT64,
+  Any_Job_Ids ARRAY<STRING>,
+  User_Emails STRING,
+  Labels STRING
+);
+
+FOR p IN (
+  SELECT project_id
+  FROM
+    UNNEST(projects) project_id
+)
+DO
+BEGIN
+EXECUTE IMMEDIATE FORMAT("""
+INSERT INTO optimization_workshop.viewable_queries_grouped_by_hash
+SELECT
+  query_hash AS Query_Hash,
+  ANY_VALUE(query_raw) AS Query_Raw_Sample,
+  SPLIT(ANY_VALUE(job_ids)[OFFSET(0)], '_')[OFFSET(0)] AS Job_Origin,
+  Ref_Tables AS Ref_Tables,
+  COUNT(DISTINCT creation_dt) AS Days_Active,
+  SUM(job_count) AS Job_Count,
+  CAST(AVG(job_count) AS INT64) AS Avg_Job_Count_Active_Days,
+  Project_Id AS Project_Id,
+  'us' AS BQ_Region,
+  Reservation_Id AS Reservation_Id,
+  CAST(SUM(total_gigabytes_processed) AS INT64) AS Total_Gigabytes_Processed,
+  CAST(SUM(total_gigabytes_processed) / SUM(job_count) AS INT64) AS Total_Gigabytes_Processed_Per_Job,
+  CAST(AVG(total_gigabytes_processed) AS INT64) AS Avg_Gigabytes_Processed,
+  CAST(SUM(total_slot_hours_per_day) AS INT64) AS Total_Slot_Hours,
+  CAST(AVG(total_slot_hours_per_day) AS INT64) AS Avg_Total_Slot_Hours_per_Active_Day,
+  CAST(AVG(avg_job_duration_seconds) AS INT64) AS Avg_Job_Duration_Seconds,
+  ANY_VALUE(job_ids) AS Any_Job_Ids,
+  STRING_AGG(DISTINCT user_emails_unnest) AS User_Emails,
+  STRING_AGG(DISTINCT labels_concat) AS Labels
+FROM (
+  SELECT
+    query_hash,
+    ANY_VALUE(query_raw) AS query_raw,
+    ref_tables AS ref_tables,
+    creation_dt AS creation_dt,
+    project_id AS project_id,
+    reservation_id AS reservation_id,
+    COUNT(*) AS job_count,
+    ARRAY_AGG(job_id ORDER BY total_slot_ms DESC LIMIT 10) AS job_ids,
+    SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours_per_day,
+    SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
+    AVG(job_duration_seconds) AS avg_job_duration_seconds,
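+    -- Collect the distinct users and labels seen for jobs with this query hash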
ARRAY_AGG(DISTINCT user_email) AS user_emails, + STRING_AGG(DISTINCT labels_concat) AS labels_concat + FROM ( + SELECT + query_info.query_hashes.normalized_literals AS query_hash, + query AS query_raw, + DATE(jbp.creation_time) AS creation_dt, + jbp.project_id AS project_id, + jbp.reservation_id AS reservation_id, + jbp.job_id AS job_id, + jbp.total_bytes_processed AS total_bytes_processed, + jbp.total_slot_ms AS total_slot_ms, + jbp.total_slot_ms / TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, MILLISECOND) AS slots, + TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, SECOND) AS job_duration_seconds, + user_email, + STRING_AGG(ref_tables.project_id || '.' || + IF + (STARTS_WITH(ref_tables.dataset_id, '_'), + 'TEMP', + ref_tables.dataset_id) || '.' || ref_tables.table_id + ORDER BY + ref_tables.project_id || '.' || + IF + (STARTS_WITH(ref_tables.dataset_id, '_'), + 'TEMP', + ref_tables.dataset_id) || '.' || ref_tables.table_id) AS ref_tables, + FORMAT("%%T", ARRAY_CONCAT_AGG(labels)) AS labels_concat + FROM + `%s.region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT as jbp + JOIN + UNNEST(referenced_tables) ref_tables + WHERE + DATE(jbp.creation_time) >= CURRENT_DATE - %i + AND jbp.end_time > jbp.start_time + AND jbp.error_result IS NULL + AND jbp.job_type = 'QUERY' + AND jbp.statement_type != 'SCRIPT' + AND ref_tables.table_id not like '%%INFORMATION_SCHEMA%%' + GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + ) + GROUP BY 1, 3, 4, 5, 6) +JOIN + UNNEST(user_emails) as user_emails_unnest +GROUP BY + Query_Hash, + Ref_Tables, + Project_Id, + BQ_Region, + Reservation_Id; +""", +p.project_id, num_days_to_scan); +EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors +END; +END FOR; diff --git a/scripts/optimization/views_with_nonoptimal_join_condition.sql b/scripts/optimization/views_with_nonoptimal_join_condition.sql new file mode 100644 index 000000000..0ee15d0ec --- /dev/null +++ b/scripts/optimization/views_with_nonoptimal_join_condition.sql @@ -0,0 +1,70 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table named, views_with_nonoptimal_join_condition, + * that contains a list of views with non-optimal join conditions. 
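+ *
+ * A join condition is flagged when either side of the equality is wrapped
+ * in a function call, for example:
+ *   JOIN table_b b ON CAST(a.id AS STRING) = b.id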
+ */
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+  SELECT ARRAY_AGG(DISTINCT project_id)
+  FROM `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION`
+  WHERE NOT deleted
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.views_with_nonoptimal_join_condition
+(
+  view_name STRING,
+  view_url STRING,
+  view_definition STRING,
+  join_conditions ARRAY<STRING>
+);
+
+CREATE TEMP FUNCTION extract_nonoptimal_join_conditions(view_definition STRING) AS(
+  ARRAY_CONCAT(
+    REGEXP_EXTRACT_ALL(
+      REGEXP_REPLACE(UPPER(view_definition), r"\sON\s", "\nON "),
+      r"\nON\s+[A-Z_]+?\([^=]*?=[^=]*"
+    ),
+    REGEXP_EXTRACT_ALL(
+      REGEXP_REPLACE(UPPER(view_definition), r"\sON\s", "\nON "),
+      r"\nON\s+[^=]*?=\s*[A-Z_]+?\([^=]*"
+)));
+
+FOR p IN (
+  SELECT project_id
+  FROM
+    UNNEST(projects) project_id
+)
+DO
+BEGIN
+  EXECUTE IMMEDIATE FORMAT(r"""
+    INSERT INTO optimization_workshop.views_with_nonoptimal_join_condition
+    SELECT
+      table_name AS view_name,
+      bqutil.fn.table_url(table_catalog || '.' || table_schema || '.' || table_name) AS view_url,
+      view_definition,
+      extract_nonoptimal_join_conditions(view_definition) AS join_conditions
+    FROM
+      `%s.region-us.INFORMATION_SCHEMA.VIEWS`
+    WHERE
+      ARRAY_LENGTH(extract_nonoptimal_join_conditions(view_definition)) >= 1
+    """,
+    p.project_id);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; -- ignore errors
+END;
+END FOR;
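Once the script above has populated `optimization_workshop.views_with_nonoptimal_join_condition`, the results can be reviewed with a query along the following lines. This is an illustrative sketch rather than part of the patch; it assumes only the table and columns defined above:

```sql
SELECT
  view_url,
  join_condition
FROM
  optimization_workshop.views_with_nonoptimal_join_condition,
  UNNEST(join_conditions) AS join_condition
ORDER BY view_url;
```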