From f1e0bba812fa74eba8b5d3548041d5a59f13dc2d Mon Sep 17 00:00:00 2001 From: Daniel De Leo Date: Mon, 20 Nov 2023 11:14:35 -0500 Subject: [PATCH] Adding BigQuery optimization scripts (#376) * add optimization scripts * Add script to README for enabling recommender API and add new script for investigating tables with high freq DML * add comments * Adding comments to existing scripts * add default to variable * linting * add script for harvesting query perf insights * nit * improve and add comments * remove query hashing UDF and instead use existing query hash field in the info schema views * Update docs * remove unnecessary check * update largest tables scripts (#375) * Add frequent_daily_table_dml to README and nits * markdown formatting * revert formatting * remove experimental script * standardizing scripts to use the same num_days_to_scan variable * refactor scripts to not do any sorting * add missing INSERT * revert * remove unnecessary INSERTs and use bqutil UDF * comments fixes * fix regex on nonoptimal view script * run table_read_patterns script before other scripts * remove unnecessary alias * remove unnecessary filter * only check for completed queries * fix perf insights sql * simplify queries_grouped_by_hash.sql by making it query over the jobs_by_org view and removed a lot of the unnecessary sql logic * Adding initial commit of queries_grouped_by_script.sql * Add a version of queries_grouped_by_hash.sql that retains the raw sql query in case it's useful in future * aggregate project and reservation ids * use start_time instead of create_time when scanning history * aggregate ref_tables as an array instead of comma separated string * remove unnecessary column * simplify check for perf insights * improve query insights scripts and readme * readme fixes and adding api enabling script * linting * add --quiet to bq tool * Updates * Update table_read_patterns_by_query_hash.sql * Update table_read_patterns_by_query_hash.sql * Update table_read_patterns_by_query_hash.sql * Update table_read_patterns_by_query_hash.sql * Update queries_grouped_by_script.sql * Update queries_grouped_by_script.sql * Add script queries_grouped_by_labels.sql * order by total_slot_ms only * fixes * add hourly_slot_consumption_by_query_hash.sql * move file --------- Co-authored-by: ddeleo Co-authored-by: Christian Yarros Co-authored-by: Raj Patel <57837394+rajc242@users.noreply.github.com> --- scripts/README.md | 3 +- scripts/optimization/README.md | 443 ++++++++++++++++++ ...bles_with_partitioning_clustering_info.sql | 121 +++++ ...bles_with_partitioning_clustering_info.sql | 103 ++++ .../table_read_patterns.sql | 57 +++ .../optimization/daily_project_analysis.sql | 94 ++++ .../optimization/enable_recommender_api.sh | 30 ++ .../optimization/frequent_daily_table_dml.sql | 52 ++ .../hourly_slot_consumption_by_query_hash.sql | 128 +++++ .../optimization/queries_grouped_by_hash.sql | 84 ++++ .../queries_grouped_by_labels.sql | 41 ++ .../queries_grouped_by_script.sql | 77 +++ .../query_performance_insights.sql | 51 ++ scripts/optimization/run_all_scripts.sh | 41 ++ scripts/optimization/table_read_patterns.sql | 205 ++++++++ .../table_read_patterns_by_query_hash.sql | 90 ++++ ...les_without_partitioning_or_clustering.sql | 99 ++++ .../viewable_queries_grouped_by_hash.sql | 168 +++++++ .../views_with_nonoptimal_join_condition.sql | 70 +++ 19 files changed, 1956 insertions(+), 1 deletion(-) create mode 100644 scripts/optimization/README.md create mode 100644 
scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql create mode 100644 scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql create mode 100644 scripts/optimization/assured_workloads_compliant/table_read_patterns.sql create mode 100644 scripts/optimization/daily_project_analysis.sql create mode 100644 scripts/optimization/enable_recommender_api.sh create mode 100644 scripts/optimization/frequent_daily_table_dml.sql create mode 100644 scripts/optimization/hourly_slot_consumption_by_query_hash.sql create mode 100644 scripts/optimization/queries_grouped_by_hash.sql create mode 100644 scripts/optimization/queries_grouped_by_labels.sql create mode 100644 scripts/optimization/queries_grouped_by_script.sql create mode 100644 scripts/optimization/query_performance_insights.sql create mode 100644 scripts/optimization/run_all_scripts.sh create mode 100644 scripts/optimization/table_read_patterns.sql create mode 100644 scripts/optimization/table_read_patterns_by_query_hash.sql create mode 100644 scripts/optimization/tables_without_partitioning_or_clustering.sql create mode 100644 scripts/optimization/viewable_queries_grouped_by_hash.sql create mode 100644 scripts/optimization/views_with_nonoptimal_join_condition.sql diff --git a/scripts/README.md b/scripts/README.md index 9a3b948b8..27cef9c9b 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -2,4 +2,5 @@ Common script utilities which can help automate common tasks within a BigQuery data warehouse. -* [billing](/scripts/billing) - Example queries over the GCP billing export to help get you started using the standard data export format \ No newline at end of file +* [billing](/scripts/billing) - Example queries over the GCP billing export to help get you started using the standard data export format. +* [optimization](/scripts/optimization) - Scripts that identify tables and queries which require optimization. \ No newline at end of file diff --git a/scripts/optimization/README.md b/scripts/optimization/README.md new file mode 100644 index 000000000..3aff72c85 --- /dev/null +++ b/scripts/optimization/README.md @@ -0,0 +1,443 @@ +# Optimization Scripts + +This folder contains scripts that (when executed) create a dataset +named, `optimization_workshop`, with several tables inside the dataset. \ +These tables are populated with information to help you optimize your BigQuery +tables, views, and queries. + +Run all the scripts within this folder using the following commands: + +```bash +gcloud auth login && +bash run_all_scripts.sh +``` + +The scripts are described in more detail in the following sections. + +--- + +# Project Analysis + +Project level analysis enables us to understand key metrics such as slot_time, +bytes_scanned, bytes_shuffled and bytes_spilled on a daily basis within a +project. The metrics are examined as averages, medians and p80s. This enables us +to understand at a high level what jobs within a project consume 80% of the time +and 50% of the time daily. + +
🔍 Daily project metrics
+
+## Daily project metrics
+
+The [daily_project_analysis.sql](daily_project_analysis.sql) script creates a
+table named `daily_project_analysis` that contains daily slot consumption
+metrics (for a 30-day period) for all of your projects.
+
+### Examples of querying script results
+
+* Top 100 project-days with the highest slot consumption
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.daily_project_analysis
+  ORDER BY total_slot_hours DESC
+  LIMIT 100
+  ```
+
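+* Projects ranked by total slot hours over the scanned period (a follow-up
+  sketch that simply rolls the daily rows up by project; the column names are
+  the ones defined in [daily_project_analysis.sql](daily_project_analysis.sql))
+
+  ```sql
+  SELECT
+    project_id,
+    SUM(total_slot_hours) AS total_slot_hours,
+    SUM(total_tb_scanned) AS total_tb_scanned
+  FROM optimization_workshop.daily_project_analysis
+  GROUP BY project_id
+  ORDER BY total_slot_hours DESC
+  ```
+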
+ +# Table Analysis + +
🔍 BigQuery Clustering/Partitioning Recommender Tool + +## BigQuery Clustering/Partitioning Recommender Tool + +The BigQuery partitioning and clustering recommender analyzes workflows on your +BigQuery tables and offers recommendations to better optimize your workflows and +query costs using either table partitioning or table clustering. The recommender +uses your BigQuery's workload execution data from the past 30 days to analyze +each BigQuery table for suboptimal partitioning and clustering configurations. + +> [!IMPORTANT] +> Before you can view partition and cluster recommendations, you +> need to [enable the Recommender API](https://cloud.google.com/recommender/docs/enabling) +> as shown in the following sections. + +### Enable using gcloud + +```bash +# The following script retrieves all distinct projects from the JOBS_BY_ORGANIZATION view +# and then enables the recommender API for each project. +projects=$( + bq query \ + --nouse_legacy_sql \ + --format=csv \ + "SELECT DISTINCT project_id FROM \`region-us\`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION" \ + | sed 1d +); +for proj in $projects; do + gcloud services --project="${proj}" enable recommender.googleapis.com & +done +``` + +### Enable using Terraform + +```hcl +resource "google_project_service" "recommender_service" { + project = "your-project" + service = "recommender.googleapis.com" +} +``` + +### View your partition and cluster recommendations + +Once you've enabled the Recommender API, +you can [view your partition and cluster recommendations](https://cloud.google.com/bigquery/docs/view-partition-cluster-recommendations#view_recommendations) +in the Cloud Console or via the gcloud command-line tool. + +> [!NOTE] +> The most scalable method for viewing your partition and cluster +> recommendations is to export your recommendations to BigQuery. +> You can do this by creating a Data Transfer Service (DTS) job to export your +> recommendations to BigQuery. +> See [Exporting recommendations to BigQuery](https://cloud.google.com/recommender/docs/bq-export/export-recommendations-to-bq#create_a_data_transfer_for_recommendations) +> for more information. + +Once you set up the DTS job to export your recommendations to BigQuery, you can +run the following query to get the most recent recommendations for partitioning +and clustering your tables. + +```sql +CREATE TEMP FUNCTION extract_table(target_resources ARRAY) AS(( + SELECT ARRAY_AGG( + REGEXP_REPLACE(REGEXP_EXTRACT(target_resource, r'\/projects\/(.*?\/datasets\/.*?\/tables\/.*)'), "(/datasets/|/tables/)", ".") + ) + FROM UNNEST(target_resources) target_resource)[OFFSET(0)] +); +SELECT + MAX(last_refresh_time) AS latest_recommendation_time, + recommender_subtype AS recommendation, + MAX_BY(JSON_VALUE_ARRAY(COALESCE( + PARSE_JSON(recommendation_details).overview.partitionColumns, + PARSE_JSON(recommendation_details).overview.clusterColumns)), + last_refresh_time) AS columns_to_cluster_or_partition, + extract_table(target_resources) AS table, + bqutil.fn.table_url(extract_table(target_resources)) AS table_url, + JSON_VALUE(PARSE_JSON(recommendation_details).overview.partitionTimeUnit) AS partition_time_unit, +-- Replace the table below with your own table that you created when you +-- set up the DTS job to export your recommendations to BigQuery. +FROM YOUR_PROJECT.YOUR_DATASET.recommendations_export +WHERE recommender = "google.bigquery.table.PartitionClusterRecommender" +GROUP BY recommendation, table, partition_time_unit, table_url +``` + +
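+Once the export is in place, a quick way to gauge how many recommendations are
+outstanding is to count them by subtype. The following sketch assumes the same
+`recommendations_export` table referenced in the query above:
+
+```sql
+SELECT
+  recommender_subtype AS recommendation,
+  COUNT(*) AS num_recommendations
+-- Replace the table below with the table you created for the
+-- recommendations DTS export.
+FROM YOUR_PROJECT.YOUR_DATASET.recommendations_export
+WHERE recommender = "google.bigquery.table.PartitionClusterRecommender"
+GROUP BY recommendation
+ORDER BY num_recommendations DESC
+```
+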
+ +
🔍 Tables with query read patterns + +## Tables with query read patterns + +The [table_read_patterns.sql](table_read_patterns.sql) script creates a table +named, `table_read_patterns`, that contains usage data to help you determine: + +* Which tables (when queried) are resulting in high slot consumption. +* Which tables are most frequently queried. + +### Examples of querying script results + +* Tables grouped by similar filter predicates + + ```sql + SELECT + table_id, + bqutil.fn.table_url(table_id) AS table_url, + (SELECT STRING_AGG(column ORDER BY COLUMN) FROM UNNEST(predicates)) column_list, + (SELECT STRING_AGG(operator ORDER BY COLUMN) FROM UNNEST(predicates)) operator_list, + (SELECT STRING_AGG(value ORDER BY COLUMN) FROM UNNEST(predicates)) value_list, + SUM(stage_slot_ms) AS total_slot_ms, + COUNT(DISTINCT DATE(creation_time)) as num_days_queried, + COUNT(*) AS num_occurrences, + COUNT(DISTINCT job_id) as job_count, + ARRAY_AGG(CONCAT(project_id,':us.',job_id) ORDER BY total_slot_ms LIMIT 10) AS job_id_array, + ARRAY_AGG(bqutil.fn.job_url(project_id || ':us.' || job_id)) AS job_url_array, + FROM optimization_workshop.table_read_patterns + GROUP BY 1,2,3,4,5; + ``` + +* Top 100 tables with the highest slot consumption + + ```sql + SELECT + table_id, + bqutil.fn.table_url(table_id) AS table_url, + SUM(stage_slot_ms) AS total_slot_ms, + COUNT(DISTINCT DATE(creation_time)) as num_days_queried, + COUNT(*) AS num_occurrences, + COUNT(DISTINCT job_id) as job_count, + FROM optimization_workshop.table_read_patterns + GROUP BY 1,2 + ORDER BY total_slot_ms DESC + LIMIT 100 + ``` + +* Top 100 most frequently queried tables + + ```sql + SELECT + table_id, + bqutil.fn.table_url(table_id) AS table_url, + SUM(stage_slot_ms) AS total_slot_ms, + COUNT(DISTINCT DATE(creation_time)) as num_days_queried, + COUNT(*) AS num_occurrences, + COUNT(DISTINCT job_id) as job_count, + FROM optimization_workshop.table_read_patterns + GROUP BY 1,2 + ORDER BY num_occurrences DESC + LIMIT 100 + ``` + +
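+* Most frequently filtered columns per table (a sketch for shortlisting
+  partitioning/clustering candidates; it unnests the `predicates` array
+  populated by [table_read_patterns.sql](table_read_patterns.sql))
+
+  ```sql
+  SELECT
+    table_id,
+    bqutil.fn.table_url(table_id) AS table_url,
+    predicate.column AS filter_column,
+    COUNT(*) AS num_filters,
+    SUM(stage_slot_ms) AS total_slot_ms
+  FROM optimization_workshop.table_read_patterns, UNNEST(predicates) predicate
+  GROUP BY table_id, table_url, filter_column
+  ORDER BY total_slot_ms DESC
+  LIMIT 100
+  ```
+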
+ +
🔍 Tables without partitioning or clustering + +## Tables without partitioning or clustering + +The [tables_without_partitioning_or_clustering.sql](tables_without_partitioning_or_clustering.sql) +script creates a table named, `tables_without_part_clust`, +that contains a list of tables which meet any of the following conditions: + +- not partitioned +- not clustered +- neither partitioned nor clustered + +### Examples of querying script results + +* Top 100 largest tables without partitioning or clustering + + ```sql + SELECT * + FROM optimization_workshop.tables_without_part_clust + ORDER BY logical_gigabytes DESC + LIMIT 100 + ``` + +
+ +
🔍 Actively read tables with partitioning and clustering information
+
+## Actively read tables with partitioning and clustering information
+
+> [!IMPORTANT]
+> The [actively_read_tables_with_partitioning_clustering_info.sql](actively_read_tables_with_partitioning_clustering_info.sql)
+> script depends on the `table_read_patterns` table, so you must first run the
+> [table_read_patterns.sql](table_read_patterns.sql) script.
+
+The [actively_read_tables_with_partitioning_clustering_info.sql](actively_read_tables_with_partitioning_clustering_info.sql)
+script creates a table named `actively_read_tables_with_part_clust_info`
+that contains a list of actively read tables along with their partitioning and
+clustering information.
+
+### Examples of querying script results
+
+* Top 100 largest **actively read** tables without partitioning or clustering
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.actively_read_tables_with_part_clust_info
+  WHERE clustering_columns IS NULL OR partitioning_column IS NULL
+  ORDER BY logical_gigabytes DESC
+  LIMIT 100
+  ```
+
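+* Largest actively read tables that are partitioned but not clustered, together
+  with the columns queries most often filter on (column names are the ones
+  defined in
+  [actively_read_tables_with_partitioning_clustering_info.sql](actively_read_tables_with_partitioning_clustering_info.sql))
+
+  ```sql
+  SELECT
+    table_url,
+    predicate_columns,
+    logical_gigabytes,
+    total_slot_ms
+  FROM optimization_workshop.actively_read_tables_with_part_clust_info
+  WHERE partitioning_column IS NOT NULL AND clustering_columns IS NULL
+  ORDER BY total_slot_ms DESC
+  LIMIT 100
+  ```
+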
+ +
🔍 Tables receiving high quantity of daily DML statements + +## Tables receiving high quantity of daily DML statements + +The [frequent_daily_table_dml.sql](frequent_daily_table_dml.sql) script creates +a table named, `frequent_daily_table_dml`, that contains tables that have had +more than 24 daily DML statements run against them in the past 30 days. + +### Examples of querying script results + +* Top 100 tables with the most DML statements per table in a day + + ```sql + SELECT + table_id, + table_url, + ANY_VALUE(dml_execution_date HAVING MAX daily_dml_per_table) AS sample_dml_execution_date, + ANY_VALUE(job_urls[OFFSET(0)] HAVING MAX daily_dml_per_table) AS sample_dml_job_url, + MAX(daily_dml_per_table) max_daily_table_dml, + FROM optimization_workshop.frequent_daily_table_dml + GROUP BY table_id, table_url + ORDER BY max_daily_table_dml DESC + LIMIT 100; + ``` + +
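+* Tables where frequent DML is also the most expensive, aggregated over the
+  scanned period (a sketch using the columns defined in
+  [frequent_daily_table_dml.sql](frequent_daily_table_dml.sql))
+
+  ```sql
+  SELECT
+    table_id,
+    table_url,
+    SUM(daily_dml_per_table) AS total_dml_statements,
+    SUM(sum_total_slot_ms) / (1000 * 60 * 60) AS total_dml_slot_hours
+  FROM optimization_workshop.frequent_daily_table_dml
+  GROUP BY table_id, table_url
+  ORDER BY total_dml_slot_hours DESC
+  LIMIT 100
+  ```
+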
+ +
🔍 Views with non-optimal JOIN conditions
+
+## Views with non-optimal JOIN conditions
+
+The [views_with_nonoptimal_join_condition.sql](views_with_nonoptimal_join_condition.sql)
+script creates a table named `views_with_nonoptimal_join_condition`, which
+contains views whose JOIN conditions are potentially non-optimal.
+
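+### Examples of querying script results
+
+* List the flagged views (the exact column layout is whatever
+  [views_with_nonoptimal_join_condition.sql](views_with_nonoptimal_join_condition.sql)
+  produces, so this sketch simply selects everything)
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.views_with_nonoptimal_join_condition
+  LIMIT 100
+  ```
+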
+ +# Query Analysis + +
🔍 Queries grouped by hash + +## Queries grouped by hash + +The [queries_grouped_by_hash.sql](queries_grouped_by_hash.sql) script creates a +table named, +`queries_grouped_by_hash`. This table groups queries by their normalized query +pattern, which ignores +comments, parameter values, UDFs, and literals in the query text. +This allows us to group queries that are logically the same, but +have different literals. The `queries_grouped_by_hash` table does not expose the +raw SQL text of the queries. + +The [viewable_queries_grouped_by_hash.sql](viewable_queries_grouped_by_hash.sql) +script creates a table named, +`viewable_queries_grouped_by_hash`. This table is similar to +the `queries_grouped_by_hash` table, but it +exposes the raw SQL text of the queries. +The `viewable_queries_grouped_by_hash.sql` script runs much slower +in execution than the `queries_grouped_by_hash.sql` script because it has to +loop over all projects and for each +project query the `INFORMATION_SCHEMA.JOBS_BY_PROJECT` view. + +For example, the following queries would be grouped together because the date +literal filters are ignored: + +```sql +SELECT * FROM my_table WHERE date = '2020-01-01'; +SELECT * FROM my_table WHERE date = '2020-01-02'; +SELECT * FROM my_table WHERE date = '2020-01-03'; +``` + +### Examples of querying script results + +* Top 100 queries with the highest bytes processed + + ```sql + SELECT * + FROM optimization_workshop.queries_grouped_by_hash + ORDER BY total_gigabytes_processed DESC + LIMIT 100 + ``` + +* Top 100 recurring queries with the highest slot hours consumed + + ```sql + SELECT * + FROM optimization_workshop.queries_grouped_by_hash + ORDER BY total_slot_hours * days_active * job_count DESC + LIMIT 100 + ``` + +
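+* Query patterns that reference a specific table (the table name below is a
+  placeholder; `referenced_tables` is populated by
+  [queries_grouped_by_hash.sql](queries_grouped_by_hash.sql))
+
+  ```sql
+  SELECT
+    query_hash,
+    job_count,
+    days_active,
+    total_slot_hours,
+    top_10_job_urls
+  FROM optimization_workshop.queries_grouped_by_hash
+  WHERE EXISTS (
+    SELECT 1
+    FROM UNNEST(referenced_tables) ref_table
+    WHERE ref_table = 'my-project.my_dataset.my_table'
+  )
+  ORDER BY total_slot_hours DESC
+  LIMIT 100
+  ```
+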
+ +
🔍 Queries grouped by script
+
+## Queries grouped by script
+
+The [queries_grouped_by_script.sql](queries_grouped_by_script.sql) script
+creates a table named `queries_grouped_by_script`. This table groups queries by
+their parent job id, which means that any queries that were executed as part of
+a script (multi-statement query) are grouped together. This table is helpful if
+you want to see which particular scripts are most inefficient.
+
+### Examples of querying script results
+
+* Top 100 scripts with the highest bytes processed
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.queries_grouped_by_script
+  ORDER BY total_gigabytes_processed DESC
+  LIMIT 100
+  ```
+
+* Top 100 scripts with the highest slot hours consumed
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.queries_grouped_by_script
+  ORDER BY total_slot_hours DESC
+  LIMIT 100
+  ```
+
+* Top 100 scripts with the highest slot hours consumed that don't reference
+  INFORMATION_SCHEMA views
+
+  ```sql
+  SELECT *
+  FROM optimization_workshop.queries_grouped_by_script
+  WHERE NOT EXISTS(
+    SELECT 1
+    FROM UNNEST(referenced_tables) ref_table
+    WHERE ref_table LIKE "%INFORMATION_SCHEMA%"
+  )
+  ORDER BY total_slot_hours DESC
+  LIMIT 100
+  ```
+
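+* Scripts that spawn the most child jobs (a sketch over the
+  `children_jobs_details` array built by
+  [queries_grouped_by_script.sql](queries_grouped_by_script.sql))
+
+  ```sql
+  SELECT
+    job_url,
+    user_email,
+    ARRAY_LENGTH(children_jobs_details) AS num_child_jobs,
+    total_slot_hours
+  FROM optimization_workshop.queries_grouped_by_script
+  ORDER BY num_child_jobs DESC
+  LIMIT 100
+  ```
+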
+ +
🔍 Hourly slot consumption by query hash
+
+## Hourly slot consumption by query hash
+
+The [hourly_slot_consumption_by_query_hash.sql](hourly_slot_consumption_by_query_hash.sql)
+script creates a table named `hourly_slot_consumption_by_query_hash`. This
+table groups queries by their query hash and then shows, for each one-hour time
+slice, how much of that hour's slot consumption each query hash accounted for.
+Before running the script, set the `my_reservation_id` variable at the top of
+the file to the reservation you want to analyze.
+
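+### Examples of querying script results
+
+* Hours with the highest total slot consumption, along with the query patterns
+  that drove them (column names are the ones defined in
+  [hourly_slot_consumption_by_query_hash.sql](hourly_slot_consumption_by_query_hash.sql))
+
+  ```sql
+  SELECT
+    period_start_hour,
+    period_total_slot_hours,
+    num_jobs_in_period,
+    top_job_patterns_per_hour
+  FROM optimization_workshop.hourly_slot_consumption_by_query_hash
+  ORDER BY period_total_slot_hours DESC
+  LIMIT 24
+  ```
+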
+ +
🔍 Queries with performance insights
+
+## Queries with performance insights
+
+The [query_performance_insights.sql](query_performance_insights.sql) script
+creates a table named `query_performance_insights` that contains all queries
+which have had performance insights generated for them in the past 30 days.
+
+### Examples of querying script results
+
+* Top 100 queries with the most performance insights
+
+  ```sql
+  SELECT
+    job_url,
+    (SELECT COUNT(1)
+      FROM UNNEST(performance_insights.stage_performance_standalone_insights) perf_insight
+      WHERE perf_insight.slot_contention
+    ) AS num_stages_with_slot_contention,
+    (SELECT COUNT(1)
+      FROM UNNEST(performance_insights.stage_performance_standalone_insights) perf_insight
+      WHERE perf_insight.insufficient_shuffle_quota
+    ) AS num_stages_with_insufficient_shuffle_quota,
+    (SELECT ARRAY_AGG(perf_insight.input_data_change.records_read_diff_percentage IGNORE NULLS)
+      FROM UNNEST(performance_insights.stage_performance_change_insights) perf_insight
+    ) AS records_read_diff_percentages
+  FROM optimization_workshop.query_performance_insights
+  ORDER BY (
+    num_stages_with_slot_contention
+    + num_stages_with_insufficient_shuffle_quota
+    + ARRAY_LENGTH(records_read_diff_percentages)
+  ) DESC
+  LIMIT 100
+  ```
+
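+* Queries whose stages hit slot contention (a simpler slice of the same table,
+  using the insight fields referenced above)
+
+  ```sql
+  SELECT job_url
+  FROM optimization_workshop.query_performance_insights
+  WHERE EXISTS (
+    SELECT 1
+    FROM UNNEST(performance_insights.stage_performance_standalone_insights) perf_insight
+    WHERE perf_insight.slot_contention
+  )
+  ```
+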
diff --git a/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql b/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql new file mode 100644 index 000000000..662194d8d --- /dev/null +++ b/scripts/optimization/actively_read_tables_with_partitioning_clustering_info.sql @@ -0,0 +1,121 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table named, actively_read_tables_with_part_clust_info, + * that contains a list of the most frequently read tables which are: + * - not partitioned + * - not clustered + * - neither partitioned nor clustered + */ + +DECLARE projects ARRAY DEFAULT ( + SELECT ARRAY_AGG(DISTINCT project_id) + FROM optimization_workshop.table_read_patterns +); + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.actively_read_tables_with_part_clust_info +( + project_id STRING, + dataset_id STRING, + table_id STRING, + total_slot_ms FLOAT64, + total_jobs INT64, + max_slot_ms_job_url STRING, + num_days_queried INT64, + predicate_columns STRING, + partitioning_column STRING, + clustering_columns STRING, + table_url STRING, + logical_gigabytes FlOAT64, + logical_terabytes FLOAT64 +); + +FOR p IN ( + SELECT project_id + FROM + UNNEST(projects) project_id +) +DO +BEGIN + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO optimization_workshop.actively_read_tables_with_part_clust_info + SELECT + rp.* EXCEPT(predicates), + ARRAY_TO_STRING(ARRAY_CONCAT_AGG(( + SELECT + ARRAY_AGG(predicate_column_counts.column || ':' || predicate_column_counts.cnt) + FROM ( + SELECT + STRUCT(predicate.column, COUNT(predicate.column) AS cnt) AS predicate_column_counts + FROM UNNEST(predicates) predicate + GROUP BY predicate.column + ))), ', ') AS predicate_columns, + partitioning_column, + clustering_columns, + bqutil.fn.table_url(rp.project_id || '.' || rp.dataset_id || '.' 
|| rp.table_id) AS table_url, + SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes, + SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes, + FROM + `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s + JOIN ( + SELECT + project_id, + dataset_id, + table_id, + ARRAY_CONCAT_AGG(predicates) AS predicates, + SUM(stage_slot_ms) AS total_slot_ms, + COUNT(DISTINCT job_id) AS total_jobs, + ANY_VALUE(job_url HAVING MAX(total_slot_ms)) AS max_slot_ms_job_url, + COUNT(DISTINCT date) AS num_days_queried, + FROM optimization_workshop.table_read_patterns + GROUP BY + project_id, + dataset_id, + table_id + ) rp + ON (s.project_id = rp.project_id AND s.table_schema = rp.dataset_id AND s.table_name = rp.table_id) + JOIN ( + SELECT + table_catalog, + table_schema, + table_name, + STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column, + STRING_AGG( + CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END + ORDER BY CLUSTERING_ORDINAL_POSITION + ) AS clustering_columns + FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS` + GROUP BY 1,2,3 + ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name) + GROUP BY + project_id, + dataset_id, + table_id, + table_url, + total_slot_ms, + total_jobs, + max_slot_ms_job_url, + num_days_queried, + partitioning_column, + clustering_columns, + total_slot_ms; + """, + p.project_id); +EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors +END; +END FOR; diff --git a/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql b/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql new file mode 100644 index 000000000..bd2fe1370 --- /dev/null +++ b/scripts/optimization/assured_workloads_compliant/actively_read_tables_with_partitioning_clustering_info.sql @@ -0,0 +1,103 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +DECLARE projects ARRAY DEFAULT ( + SELECT ARRAY_AGG(DISTINCT project_id) + FROM optimization_workshop.table_read_patterns +); + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.actively_read_tables_with_part_clust_info +( + project_id STRING, + dataset_id STRING, + table_id STRING, + total_slot_ms FLOAT64, + total_jobs INT64, + max_slot_ms_job_url STRING, + num_days_queried INT64, + partitioning_column STRING, + clustering_columns STRING, + table_url STRING, + logical_gigabytes FlOAT64, + logical_terabytes FLOAT64 +); + +FOR p IN ( + SELECT project_id + FROM + UNNEST(projects) project_id +) +DO +BEGIN + EXECUTE IMMEDIATE FORMAT(""" + INSERT INTO optimization_workshop.actively_read_tables_with_part_clust_info + SELECT + rp.*, + partitioning_column, + clustering_columns, + bqutil.fn.table_url(rp.project_id || '.' || rp.dataset_id || '.' 
|| rp.table_id) AS table_url, + SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes, + SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes, + FROM + `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s + JOIN ( + SELECT + project_id, + dataset_id, + table_id, + SUM(total_slot_ms) AS total_slot_ms, + COUNT(DISTINCT job_id) AS total_jobs, + ANY_VALUE(job_url HAVING MAX(total_slot_ms)) AS max_slot_ms_job_url, + COUNT(DISTINCT date) AS num_days_queried, + FROM optimization_workshop.table_read_patterns + GROUP BY + project_id, + dataset_id, + table_id + ) rp + ON (s.project_id = rp.project_id AND s.table_schema = rp.dataset_id AND s.table_name = rp.table_id) + JOIN ( + SELECT + table_catalog, + table_schema, + table_name, + STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column, + STRING_AGG( + CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END + ORDER BY CLUSTERING_ORDINAL_POSITION + ) AS clustering_columns + FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS` + GROUP BY 1,2,3 + ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name) + GROUP BY + project_id, + dataset_id, + table_id, + table_url, + total_slot_ms, + total_jobs, + max_slot_ms_job_url, + num_days_queried, + partitioning_column, + clustering_columns, + total_slot_ms; + """, + p.project_id); +EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors +END; +END FOR; \ No newline at end of file diff --git a/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql b/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql new file mode 100644 index 000000000..659ec4633 --- /dev/null +++ b/scripts/optimization/assured_workloads_compliant/table_read_patterns.sql @@ -0,0 +1,57 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The following script creates a table named, table_read_patterns, + * that contains a list of the most frequently read tables within the + * past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns +CLUSTER BY project_id, dataset_id, table_id AS +SELECT + DATE(creation_time) AS date, + jbo.project_id, + table.dataset_id, + table.table_id, + table.project_id || '.' || table.dataset_id || '.' || table.table_id AS full_table_id, + job_id, + bqutil.fn.job_url(jbo.project_id || ':us.' || job_id) AS job_url, + parent_job_id, + bqutil.fn.job_url(jbo.project_id || ':us.' 
|| parent_job_id) AS parent_job_url, + reservation_id, + total_bytes_billed, + total_slot_ms, + creation_time, + start_time, + end_time +FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo, +UNNEST(referenced_tables) table +WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND ( + table.project_id||table.dataset_id||table.table_id + <> destination_table.project_id||destination_table.dataset_id||destination_table.table_id + ) + AND job_type = 'QUERY' + AND statement_type != 'SCRIPT' + AND NOT cache_hit + AND error_result IS NULL; diff --git a/scripts/optimization/daily_project_analysis.sql b/scripts/optimization/daily_project_analysis.sql new file mode 100644 index 000000000..4b30a848b --- /dev/null +++ b/scripts/optimization/daily_project_analysis.sql @@ -0,0 +1,94 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table that contains daily slot consumption information + * about BigQuery jobs for a given set of projects within the past 30 days. + * + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.daily_project_analysis AS +SELECT + day, + project_id, + COUNT(*) AS job_count, + AVG(total_slot_ms)/1000 AS avg_total_slot_secs, + MAX(median_total_slot_ms)/1000 AS median_total_slot_secs, + MAX(p80_total_slot_ms)/1000 AS p80_total_slot_secs, + SUM(total_slot_ms)/1000/60/60 AS total_slot_hours, + AVG(time_secs) AS avg_time_secs, + MAX(median_time_secs) AS median_time_secs, + SUM(time_secs)/60/60 AS total_time_hours, + MAX(p80_time_secs) AS p80_time_secs, + AVG(bytes_scanned)/POW(1024,3) AS avg_gb_scanned, + MAX(p80_bytes_scanned)/POW(1024,3) AS p80_gb_scanned, + SUM(bytes_scanned)/POW(1024,4) AS total_tb_scanned, + AVG(bytes_shuffled)/POW(1024,3) AS avg_gb_shuffled, + MAX(p80_bytes_shuffled)/POW(1024,3) AS p80_gb_shuffled, + SUM(bytes_shuffled)/POW(1024,4) AS total_tb_shuffled, + AVG(bytes_spilled)/POW(1024,3) AS avg_gb_spilled, + MAX(p80_bytes_spilled)/POW(1024,3) AS p80_gb_spilled, + SUM(bytes_spilled)/POW(1024,4) AS total_tb_spilled, +FROM( + SELECT + day, + project_id, + total_slot_ms, + PERCENTILE_CONT(total_slot_ms, 0.5) + OVER (PARTITION BY day, project_id) AS median_total_slot_ms, + PERCENTILE_CONT(total_slot_ms, 0.8) + OVER (PARTITION BY day, project_id) AS p80_total_slot_ms, + time_secs, + PERCENTILE_CONT(time_secs, 0.5) + OVER (PARTITION BY day, project_id) AS median_time_secs, + PERCENTILE_CONT(time_secs, 0.8) + OVER (PARTITION BY day, project_id) AS p80_time_secs, + total_bytes_scanned bytes_scanned, + PERCENTILE_CONT(total_bytes_scanned, 0.8) + OVER (PARTITION BY day, project_id) AS p80_bytes_scanned, + bytes_shuffled, + PERCENTILE_CONT(bytes_shuffled, 0.8) + OVER (PARTITION BY day, project_id) AS p80_bytes_shuffled, + bytes_spilled, + 
PERCENTILE_CONT(bytes_spilled, 0.8) + OVER (PARTITION BY day, project_id) AS p80_bytes_spilled + FROM( + SELECT + DATE(jbo.creation_time) AS day, + project_id, + job_id, + total_slot_ms, + TIMESTAMP_DIFF(jbo.end_time,jbo.start_time, SECOND) AS time_secs, + total_bytes_billed total_bytes_scanned, + (SELECT SUM(stage.shuffle_output_bytes) FROM UNNEST(job_stages) stage) AS bytes_shuffled, + (SELECT SUM(stage.shuffle_output_bytes_spilled) FROM UNNEST(job_stages) stage) AS bytes_spilled + FROM + `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo + WHERE + DATE(jbo.creation_time) >= CURRENT_DATE - num_days_to_scan + -- Uncomment below to specify a list of projects to analyze + -- AND jbo.project_id IN () + AND jbo.job_type = 'QUERY' + AND jbo.end_time > jbo.start_time + AND jbo.error_result IS NULL + AND jbo.statement_type != 'SCRIPT' +)) +GROUP BY 1, 2; diff --git a/scripts/optimization/enable_recommender_api.sh b/scripts/optimization/enable_recommender_api.sh new file mode 100644 index 000000000..6aeca2742 --- /dev/null +++ b/scripts/optimization/enable_recommender_api.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The following script retrieves all distinct projects from the JOBS_BY_ORGANIZATION view +# and then enables the recommender API for each project. +# This is useful for when you have a large number of projects and you want to enable the +# recommender API for all of them. +projects=$( + bq query \ + --nouse_legacy_sql \ + --format=csv \ + "SELECT DISTINCT project_id FROM \`region-us\`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION" | + sed 1d +) +for proj in $projects; do + gcloud services --project="${proj}" enable recommender.googleapis.com & +done diff --git a/scripts/optimization/frequent_daily_table_dml.sql b/scripts/optimization/frequent_daily_table_dml.sql new file mode 100644 index 000000000..897424598 --- /dev/null +++ b/scripts/optimization/frequent_daily_table_dml.sql @@ -0,0 +1,52 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * The following script will return tables that have had > 24 DML statements + * run against in any one day within the past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. 
+ */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.frequent_daily_table_dml AS +SELECT + destination_table.project_id, + destination_table.dataset_id, + destination_table.table_id, + EXTRACT(DATE FROM creation_time) AS dml_execution_date, + bqutil.fn.table_url(destination_table.project_id || '.' || destination_table.dataset_id || '.' || destination_table.table_id) AS table_url, + COUNT(1) AS daily_dml_per_table, + ARRAY_AGG(job_id) AS job_ids, + ARRAY_AGG(bqutil.fn.job_url(project_id || ':us.' || job_id) IGNORE NULLS) AS job_urls, + ARRAY_AGG(DISTINCT statement_type) AS statement_types, + SUM(SAFE_DIVIDE(total_bytes_processed, pow(2,30))) AS sum_total_gb_processed, + AVG(SAFE_DIVIDE(total_bytes_processed, pow(2,30))) AS avg_total_gb_processed, + SUM(total_slot_ms) AS sum_total_slot_ms, + AVG(total_slot_ms) AS avg_total_slot_ms, + SUM(SAFE_DIVIDE(total_slot_ms, (TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)))) AS sum_avg_slots, + AVG(SAFE_DIVIDE(total_slot_ms, (TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)))) AS avg_avg_slots, +FROM + `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +WHERE + -- Look at the past 30 days of jobs + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + -- Only look at DML statements + AND statement_type IN ('INSERT', 'UPDATE', 'DELETE', 'MERGE') + GROUP BY 1,2,3,4,5 + HAVING daily_dml_per_table > 24; diff --git a/scripts/optimization/hourly_slot_consumption_by_query_hash.sql b/scripts/optimization/hourly_slot_consumption_by_query_hash.sql new file mode 100644 index 000000000..337e5e2b9 --- /dev/null +++ b/scripts/optimization/hourly_slot_consumption_by_query_hash.sql @@ -0,0 +1,128 @@ +DECLARE num_days_to_scan INT64 DEFAULT 30; +DECLARE my_reservation_id STRING DEFAULT "your_reservation_id"; + + +CREATE OR REPLACE TABLE `optimization_workshop.hourly_slot_consumption_by_query_hash` AS +SELECT * EXCEPT(jobs_per_hour), +-- Get the top 10 query patterns by slot hours +ARRAY( + SELECT AS STRUCT + -- since period is hourly, period_slot_hours/(1hr duration) = period_num_slots + ROUND(period_slot_hours, 2) AS num_period_slots, + ROUND(period_slot_hours / period_total_slot_hours * 100, 2) || "%" AS pct_of_total_period, + COALESCE(slot_hunger,0) AS slot_hunger, + ROUND(avg_period_estimated_runnable_units,2) AS avg_period_estimated_runnable_units, + ROUND(max_duration_minutes, 2) AS max_duration_minutes, + ROUND(max_p90_period_slots, 2) AS max_p90_period_slots, + ROUND(p90_avg_job_period_slots, 2) AS p90_avg_job_period_slots, + ROUND(p90_duration_minutes, 2) AS p90_duration_minutes, + query_hash, + num_days_query_hash_active, + labels, + num_jobs_per_period, + job_url, + parent_job_url, + FROM( + SELECT + query_hash, + ANY_VALUE(num_days_query_hash_active) AS num_days_query_hash_active, + ARRAY_AGG(DISTINCT FORMAT("%T", labels)) AS labels, + MAX(duration_ms)/1000/60 AS max_duration_minutes, + SUM(slot_hunger) AS slot_hunger, + AVG(avg_period_estimated_runnable_units) AS avg_period_estimated_runnable_units, + MAX(p90_job_period_slots) AS max_p90_period_slots, + ANY_VALUE(p90_avg_job_period_slots) AS p90_avg_job_period_slots, + ANY_VALUE(p90_duration_minutes) AS p90_duration_minutes, + SUM(period_slot_hours) AS period_slot_hours, + COUNT(1) AS num_jobs_per_period, + ANY_VALUE(job_url HAVING MAX duration_ms) AS job_url, + ANY_VALUE(parent_job_url HAVING MAX duration_ms) AS parent_job_url, + ANY_VALUE(user_email HAVING MAX duration_ms) AS user_email, + FROM ( + 
SELECT *, + PERCENTILE_CONT(duration_ms, 0.9) OVER( + PARTITION BY query_hash + )/1000/60 AS p90_duration_minutes, + PERCENTILE_CONT(avg_job_period_slots, 0.9) OVER( + PARTITION BY query_hash + ) AS p90_avg_job_period_slots, + FROM UNNEST(jobs_per_hour) + ) + GROUP BY 1 + ORDER BY period_slot_hours DESC + LIMIT 10 + ) +) AS top_job_patterns_per_hour +FROM( + SELECT + FORMAT_TIMESTAMP("%F %H", period_start_hour, "America/New_York") AS period_start_hour, + COUNT(DISTINCT job_id) AS num_jobs_in_period, + SUM(period_slot_ms)/1000/60/60 AS period_total_slot_hours, + MAX(p90_job_period_slots) AS max_p90_job_period_slots, + ARRAY_AGG(STRUCT( + period_slot_ms/1000/60/60 AS period_slot_hours, + p90_job_period_slots, + avg_job_period_slots, + duration_ms, + avg_period_estimated_runnable_units, + slot_hunger, + query_hash, + num_days_query_hash_active, + labels, + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url, + COALESCE(bqutil.fn.job_url(project_id || ':us.' || parent_job_id), '') AS parent_job_url, + user_email + )) AS jobs_per_hour, + FROM( + SELECT + TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York") AS period_start_hour, + project_id, + parent_job_id, + job_id, + user_email, + AVG(COALESCE(period_estimated_runnable_units,0)) AS avg_period_estimated_runnable_units, + SUM(IF(COALESCE(period_estimated_runnable_units,0) >= 1000,1,0)) AS slot_hunger, + SUM(period_slot_ms) AS period_slot_ms, + ANY_VALUE(p90_job_period_slots) AS p90_job_period_slots, + ANY_VALUE(avg_job_period_slots) AS avg_job_period_slots, + FROM( + SELECT *, + PERCENTILE_CONT(period_slot_ms/1000, 0.9) OVER( + PARTITION BY TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York"), job_id + ) AS p90_job_period_slots, + AVG(period_slot_ms/1000) OVER( + PARTITION BY TIMESTAMP_TRUNC(period_start, HOUR, "America/New_York"), job_id + ) AS avg_job_period_slots, + + FROM `region-us`.INFORMATION_SCHEMA.JOBS_TIMELINE_BY_ORGANIZATION + WHERE + DATE(job_creation_time) >= CURRENT_DATE - num_days_to_scan + AND reservation_id = my_reservation_id + AND state IN ('RUNNING','DONE') + AND statement_type != 'SCRIPT' + AND period_slot_ms > 0 + -- Uncomment below to ignore slots consumed by failed queries + -- AND error_result IS NULL + ) + GROUP BY 1,2,3,4,5 + ) + JOIN ( + SELECT + job_id, + query_info.query_hashes.normalized_literals AS query_hash, + labels, + TIMESTAMP_DIFF(end_time, start_time, MILLISECOND) AS duration_ms, + COUNT(DISTINCT DATE(creation_time)) OVER( + PARTITION BY query_info.query_hashes.normalized_literals + ) AS num_days_query_hash_active, + FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION + WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND reservation_id = my_reservation_id + AND statement_type != 'SCRIPT' + AND total_slot_ms > 0 + -- Uncomment below to ignore slots consumed by failed queries + -- AND error_result IS NULL + ) USING(job_id) + GROUP BY period_start_hour +); \ No newline at end of file diff --git a/scripts/optimization/queries_grouped_by_hash.sql b/scripts/optimization/queries_grouped_by_hash.sql new file mode 100644 index 000000000..e4bbfdce0 --- /dev/null +++ b/scripts/optimization/queries_grouped_by_hash.sql @@ -0,0 +1,84 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table named, top_bytes_scanning_queries_by_hash, + * which contains the top 200 most expensive queries by total bytes scanned + * within the past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + * Queries are grouped by their normalized query pattern, which ignores + * comments, parameter values, UDFs, and literals in the query text. + * This allows us to group queries that are logically the same, but + * have different literals. + * + * For example, the following queries would be grouped together: + * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01' + * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02' + * SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03' + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS ( + COALESCE(( + SELECT SUM(IF(i.slot_contention, 1, 0) + IF(i.insufficient_shuffle_quota, 1, 0)) + FROM UNNEST(query_info.performance_insights.stage_performance_standalone_insights) i), 0) + + COALESCE(ARRAY_LENGTH(query_info.performance_insights.stage_performance_change_insights), 0) +); + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_hash AS +SELECT + statement_type, + query_info.query_hashes.normalized_literals AS query_hash, + COUNT(DISTINCT DATE(start_time)) AS days_active, + ARRAY_AGG(DISTINCT project_id IGNORE NULLS) AS project_ids, + ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids, + SUM(num_stages_with_perf_insights(query_info)) AS num_stages_with_perf_insights, + COUNT(DISTINCT (project_id || ':us.' || job_id)) AS job_count, + ARRAY_AGG( + STRUCT( + bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url, + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url + ) + ORDER BY total_slot_ms + DESC LIMIT 10) AS top_10_job_urls, + ARRAY_AGG(DISTINCT user_email) AS user_emails, + SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed, + AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed, + SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours, + AVG(total_slot_ms) / (1000 * 60 * 60) AS avg_total_slot_hours_per_active_day, + AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds, + ARRAY_AGG(DISTINCT FORMAT("%T",labels)) AS labels, + SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots, + AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots, + -- query hashes will all have the same referenced tables so we can use ANY_VALUE below + ANY_VALUE(ARRAY( + SELECT + ref_table.project_id || '.' || + IF(STARTS_WITH(ref_table.dataset_id, '_'), 'TEMP', ref_table.dataset_id) + || '.' 
|| ref_table.table_id + FROM UNNEST(referenced_tables) ref_table + )) AS referenced_tables, +FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND state = 'DONE' + AND error_result IS NULL + AND job_type = 'QUERY' + AND statement_type != 'SCRIPT' +GROUP BY statement_type, query_hash; diff --git a/scripts/optimization/queries_grouped_by_labels.sql b/scripts/optimization/queries_grouped_by_labels.sql new file mode 100644 index 000000000..77d5514cb --- /dev/null +++ b/scripts/optimization/queries_grouped_by_labels.sql @@ -0,0 +1,41 @@ +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE TEMP FUNCTION num_stages_with_perf_insights(query_info ANY TYPE) AS ( + COALESCE(( + SELECT SUM(IF(i.slot_contention, 1, 0) + IF(i.insufficient_shuffle_quota, 1, 0)) + FROM UNNEST(query_info.performance_insights.stage_performance_standalone_insights) i), 0) + + COALESCE(ARRAY_LENGTH(query_info.performance_insights.stage_performance_change_insights), 0) +); + +CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_labels AS +SELECT + FORMAT("%T", labels) AS labels, + COUNT(DISTINCT DATE(start_time)) AS days_active, + ARRAY_AGG(DISTINCT project_id IGNORE NULLS) AS project_ids, + ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids, + SUM(num_stages_with_perf_insights(query_info)) AS num_stages_with_perf_insights, + COUNT(DISTINCT (project_id || ':us.' || job_id)) AS job_count, + ARRAY_AGG( + STRUCT( + bqutil.fn.job_url(project_id || ':us.' || parent_job_id) AS parent_job_url, + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url + ) + ORDER BY total_slot_ms + DESC LIMIT 10) AS top_10_job_urls, + ARRAY_AGG(DISTINCT user_email) AS user_emails, + SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed, + AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed, + SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours, + SUM(total_slot_ms) / (1000 * 60 * 60) / COUNT(DISTINCT DATE(start_time)) AS avg_total_slot_hours_per_active_day, + AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds, + SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots, + AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots, +FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND state = 'DONE' + AND error_result IS NULL + AND job_type = 'QUERY' + AND statement_type != 'SCRIPT' + AND user_email LIKE '%gserviceaccount.com' +GROUP BY labels; \ No newline at end of file diff --git a/scripts/optimization/queries_grouped_by_script.sql b/scripts/optimization/queries_grouped_by_script.sql new file mode 100644 index 000000000..e7803cff8 --- /dev/null +++ b/scripts/optimization/queries_grouped_by_script.sql @@ -0,0 +1,77 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE TEMP FUNCTION job_stage_max_slots(job_stages ANY TYPE) AS (( + SELECT MAX(SAFE_DIVIDE(stage.slot_ms,stage.end_ms - stage.start_ms)) + FROM UNNEST(job_stages) stage +)); +CREATE TEMP FUNCTION total_bytes_shuffled(job_stages ANY TYPE) AS (( + SELECT SUM(stage.shuffle_output_bytes) + FROM UNNEST(job_stages) stage +)); +CREATE TEMP FUNCTION total_shuffle_bytes_spilled(job_stages ANY TYPE) AS (( + SELECT SUM(stage.shuffle_output_bytes_spilled) + FROM UNNEST(job_stages) stage +)); + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.queries_grouped_by_script AS +SELECT * REPLACE(( + SELECT + ARRAY_AGG(DISTINCT table.project_id || "." || table.dataset_id || "." || table.table_id) + FROM UNNEST(referenced_tables) table + ) AS referenced_tables) +FROM( + SELECT + bqutil.fn.job_url( + parent.project_id || ':us.' || parent.job_id) AS job_url, + parent.user_email, + parent.start_time, + ANY_VALUE(parent.labels) AS labels, + SUM(COALESCE(SAFE_DIVIDE(child.total_slot_ms, TIMESTAMP_DIFF( + child.end_time, child.start_time, MILLISECOND)), 0)) AS total_slots, + SUM(COALESCE(child.total_slot_ms, 0)) AS total_slot_ms, + SUM(COALESCE(child.total_slot_ms / (1000 * 60 * 60), 0)) AS total_slot_hours, + SUM(COALESCE(child.total_bytes_processed, 0)) / POW(1024, 3) AS total_gigabytes_processed, + SUM(COALESCE(child.total_bytes_processed, 0)) / POW(1024, 4) AS total_terabytes_processed, + ARRAY_CONCAT_AGG(child.referenced_tables) AS referenced_tables, + ARRAY_AGG(STRUCT( + bqutil.fn.job_url( + child.project_id || ':us.' || child.job_id) AS job_url, + child.reservation_id AS reservation_id, + EXTRACT(DATE FROM child.creation_time) AS creation_date, + TIMESTAMP_DIFF(child.end_time, child.start_time, SECOND) AS job_duration_seconds, + child.total_slot_ms AS total_slot_ms, + SAFE_DIVIDE(child.total_slot_ms,TIMESTAMP_DIFF( + child.end_time, child.start_time, MILLISECOND)) AS job_avg_slots, + job_stage_max_slots(child.job_stages) AS job_stage_max_slots, + total_bytes_shuffled(child.job_stages) AS total_bytes_shuffled, + total_shuffle_bytes_spilled(child.job_stages) AS total_shuffle_bytes_spilled) + ORDER BY child.start_time + ) AS children_jobs_details + FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION AS parent + JOIN `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION AS child + ON parent.job_id = child.parent_job_id + WHERE + DATE(parent.creation_time) >= CURRENT_DATE - num_days_to_scan + AND parent.state = 'DONE' + AND parent.error_result IS NULL + GROUP BY job_url, user_email, start_time + HAVING + ARRAY_LENGTH(children_jobs_details) > 1 + AND total_slot_ms > 0); diff --git a/scripts/optimization/query_performance_insights.sql b/scripts/optimization/query_performance_insights.sql new file mode 100644 index 000000000..1e0218893 --- /dev/null +++ b/scripts/optimization/query_performance_insights.sql @@ -0,0 +1,51 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This script retrieves the top 100 queries that have had performance insights + * generated for them in the past 30 days. + * 30 days is the default timeframe, but you can change this by setting the + * num_days_to_scan variable to a different value. + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.query_performance_insights AS +SELECT + bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url, + query_info.performance_insights AS performance_insights +FROM + `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION +WHERE + DATE(creation_time) >= CURRENT_DATE - num_days_to_scan + AND job_type = 'QUERY' + AND state = 'DONE' + AND error_result IS NULL + AND statement_type != 'SCRIPT' + AND EXISTS ( -- Only include queries which had performance insights + SELECT 1 + FROM UNNEST( + query_info.performance_insights.stage_performance_standalone_insights + ) + WHERE slot_contention OR insufficient_shuffle_quota + UNION ALL + SELECT 1 + FROM UNNEST( + query_info.performance_insights.stage_performance_change_insights + ) + WHERE input_data_change.records_read_diff_percentage IS NOT NULL + ); diff --git a/scripts/optimization/run_all_scripts.sh b/scripts/optimization/run_all_scripts.sh new file mode 100644 index 000000000..596dca24a --- /dev/null +++ b/scripts/optimization/run_all_scripts.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Exit immediately if a command exits with a non-zero status. +set -e +# Set the following flags for the bq command: +# --quiet: suppress status updates while jobs are running +# --nouse_legacy_sql: use standard SQL syntax +# --nouse_cache: do not use cached results +bq_flags="--quiet --nouse_legacy_sql --nouse_cache" + +# Run all the .sql files in the current directory in parallel, +# except for table_read_patterns.sql +# and actively_read_tables_with_partitioning_clustering_info.sql +# since they'll be run sequentially due to the depedency between them. +for f in *.sql; do + if [[ $f = "table_read_patterns.sql" || + $f = "actively_read_tables_with_partitioning_clustering_info.sql" ]]; then + # Skip this file, it's already been run + continue + fi + bq query ${bq_flags} < $f & +done + +# Run the table_read_patterns.sql file first because it's a dependency for +# actively_read_tables_with_partitioning_clustering_info.sql +bq query ${bq_flags} DEFAULT ( + SELECT ARRAY_AGG(DISTINCT project_id) + FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION + WHERE DATE(creation_time) >= CURRENT_DATE - num_days_to_scan +); + +/* +-- Uncomment this block if you just want to scan the top 1000 projects +-- by total bytes billed in the past 30 days. 
+DECLARE projects ARRAY<STRING> DEFAULT (
+  SELECT ARRAY_AGG(project_id)
+  FROM(
+    SELECT project_id
+    FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+    WHERE DATE(creation_time) >= CURRENT_DATE - num_days_to_scan
+    GROUP BY 1
+    ORDER BY SUM(total_bytes_billed) DESC
+    LIMIT 1000
+));
+*/
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns
+(
+  date DATE,
+  project_id STRING,
+  dataset_id STRING,
+  table_id STRING,
+  full_table_id STRING,
+  job_id STRING,
+  job_url STRING,
+  parent_job_id STRING,
+  parent_job_url STRING,
+  reservation_id STRING,
+  total_bytes_billed INT64,
+  total_slot_ms INT64,
+  creation_time TIMESTAMP,
+  start_time TIMESTAMP,
+  end_time TIMESTAMP,
+  stage_name STRING,
+  stage_id INT64,
+  stage_slot_ms INT64,
+  total_job_read_slot_ms INT64,
+  records_read INT64,
+  records_written INT64,
+  shuffle_output_bytes INT64,
+  shuffle_output_bytes_spilled INT64,
+  parallel_inputs INT64,
+  read_ratio_avg FLOAT64,
+  read_ms_avg INT64,
+  wait_ratio_avg FLOAT64,
+  wait_ms_avg INT64,
+  compute_ratio_avg FLOAT64,
+  compute_ms_avg INT64,
+  write_ratio_avg FLOAT64,
+  write_ms_avg INT64,
+  predicates ARRAY<STRUCT<operator STRING, column STRING, value STRING>>
+) CLUSTER BY project_id, dataset_id, table_id;
+
+CREATE TEMP FUNCTION mapColumns(where_clause STRING, column_list ARRAY<STRING>)
+RETURNS STRING
+LANGUAGE js AS r"""
+  if (!where_clause) {
+    return null;
+  }
+  column_list.forEach(function(col) {
+    const tokens = col.split(":");
+    where_clause = where_clause.replaceAll(tokens[0].trim(), tokens[1]);
+  });
+  return where_clause;
+""";
+
+FOR p IN (
+  SELECT project_id
+  FROM
+    UNNEST(projects) project_id
+)
+DO
+BEGIN
+EXECUTE IMMEDIATE FORMAT("""
+INSERT INTO optimization_workshop.table_read_patterns
+  SELECT
+    DATE(creation_time) AS date,
+    project_id,
+    IF(ARRAY_LENGTH(SPLIT(table_id, '.'))=2, SPLIT(table_id, '.')[OFFSET(0)], SPLIT(table_id, '.')[OFFSET(1)]) AS dataset_id,
+    SPLIT(table_id, '.')[ORDINAL(ARRAY_LENGTH(SPLIT(table_id, '.')))] AS table_id,
+    IF(ARRAY_LENGTH(SPLIT(table_id, '.'))=2, project_id || '.' || table_id, table_id) AS full_table_id,
+    job_id,
+    bqutil.fn.job_url(project_id || ':us.' || job_id) AS job_url,
+    parent_job_id,
+    bqutil.fn.job_url(project_id || ':us.'
|| parent_job_id) AS parent_job_url, + reservation_id, + total_bytes_billed, + total_slot_ms, + creation_time, + start_time, + end_time, + stage_name, + stage_id, + stage_slot_ms, + total_job_read_slot_ms, + records_read, + records_written, + shuffle_output_bytes, + shuffle_output_bytes_spilled, + parallel_inputs, + read_ratio_avg, + read_ms_avg, + wait_ratio_avg, + wait_ms_avg, + compute_ratio_avg, + compute_ms_avg, + write_ratio_avg, + write_ms_avg, + ARRAY( + SELECT STRUCT( + REGEXP_EXTRACT(predicate, '^[[:word:]]+') AS operator, + REGEXP_EXTRACT(predicate, '[(]([[:word:]]+)') AS column, + REGEXP_EXTRACT(predicate, '[,](.+)[)]') AS value ) + FROM UNNEST(filters) AS predicate + ) AS predicates + FROM ( + SELECT *, + REGEXP_EXTRACT_ALL( + mapcolumns(where_clause, projection_list), + '[[:word:]]+[(][^()]*?[)]') AS filters + FROM ( + SELECT + jbp.project_id, + job_id, + parent_job_id, + reservation_id, + total_bytes_billed, + total_slot_ms, + creation_time, + start_time, + end_time, + js.name AS stage_name, + js.id AS stage_id, + SUM(js.slot_ms) OVER (PARTITION BY job_id) AS total_job_read_slot_ms, + js.slot_ms AS stage_slot_ms, + js.records_read, + js.records_written, + js.shuffle_output_bytes, + js.shuffle_output_bytes_spilled, + js.parallel_inputs, + js.read_ratio_avg, + js.read_ms_avg, + js.wait_ratio_avg, + js.wait_ms_avg, + js.compute_ratio_avg, + js.compute_ms_avg, + js.write_ratio_avg, + js.write_ms_avg, + SPLIT(js_steps.substeps[safe_OFFSET(0)], ',') AS projection_list, + REPLACE(js_steps.substeps[safe_OFFSET(1)],'FROM ', '') AS table_id, + js_steps.substeps[safe_OFFSET(2)] AS where_clause + FROM `%s.region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT jbp + JOIN UNNEST(job_stages) AS js + JOIN UNNEST(steps) AS js_steps + WHERE + DATE(creation_time) >= CURRENT_DATE - %i + AND js_steps.kind = 'READ' + AND jbp.job_type = 'QUERY' + AND jbp.statement_type != 'SCRIPT' + AND NOT cache_hit + AND error_result IS NULL + AND NOT EXISTS ( -- Exclude queries over INFORMATION_SCHEMA + SELECT 1 + FROM UNNEST(js_steps.substeps) AS substeps + WHERE substeps LIKE 'FROM %%.INFORMATION_SCHEMA.%%') + AND EXISTS ( -- Only include substeps with a FROM clause + SELECT 1 + FROM UNNEST(js_steps.substeps) AS substeps + WHERE substeps LIKE 'FROM %%.%%') +))""", p.project_id, num_days_to_scan); +EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors +END; +END FOR; diff --git a/scripts/optimization/table_read_patterns_by_query_hash.sql b/scripts/optimization/table_read_patterns_by_query_hash.sql new file mode 100644 index 000000000..dd69ec428 --- /dev/null +++ b/scripts/optimization/table_read_patterns_by_query_hash.sql @@ -0,0 +1,90 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* + * This script creates a table named, table_read_patterns_by_query_hash + */ + +DECLARE num_days_to_scan INT64 DEFAULT 30; + +CREATE SCHEMA IF NOT EXISTS optimization_workshop; +CREATE OR REPLACE TABLE optimization_workshop.table_read_patterns_by_query_hash +CLUSTER BY project_id, dataset_id, table_id AS +SELECT + project_id, + dataset_id, + table_id, + COUNT(query_pattern) AS num_query_patterns, + ARRAY_AGG( + STRUCT( + query_pattern, + top_10_slot_ms_jobs, + avg_slot_hours, + days_active, + job_count, + total_gigabytes_processed, + avg_gigabytes_processed, + total_slot_hours, + avg_total_slot_hours_per_active_day, + avg_job_duration_seconds, + total_slots, + avg_total_slots + ) + ORDER BY avg_slot_hours * days_active * job_count DESC LIMIT 10 + ) AS top_10_slot_ms_patterns, + MAX(days_active) AS max_days_active, + MAX(avg_slot_hours) AS max_avg_slot_hours_across_all_patterns, + MAX(days_active * avg_slot_hours) AS max_weighted_avg_slot_hours, +FROM( + SELECT + referenced_table.project_id, + referenced_table.dataset_id, + referenced_table.table_id, + query_info.query_hashes.normalized_literals AS query_pattern, + ARRAY_AGG(STRUCT( + bqutil.fn.job_url(jbo.project_id || ':us.' || job_id) AS job_url, + bqutil.fn.job_url(jbo.project_id || ':us.' || parent_job_id) AS parent_job_url + ) + ORDER BY total_slot_ms DESC LIMIT 10 + ) AS top_10_slot_ms_jobs, + COUNT(DISTINCT DATE(start_time)) AS days_active, + ARRAY_AGG(DISTINCT jbo.project_id IGNORE NULLS) AS project_ids, + ARRAY_AGG(DISTINCT reservation_id IGNORE NULLS) AS reservation_ids, + COUNT(DISTINCT job_Id) AS job_count, + ARRAY_AGG(DISTINCT user_email) AS user_emails, + SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed, + AVG(total_bytes_processed) / POW(1024, 3) AS avg_gigabytes_processed, + SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours, + AVG(total_slot_ms) / (1000 * 60 * 60) AS avg_slot_hours, + SUM(total_slot_ms) / (1000 * 60 * 60) / COUNT(DISTINCT DATE(start_time)) AS avg_total_slot_hours_per_active_day, + AVG(TIMESTAMP_DIFF(end_time, start_time, SECOND) ) AS avg_job_duration_seconds, + SUM(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS total_slots, + AVG(total_slot_ms / TIMESTAMP_DIFF(end_time, start_time, MILLISECOND)) AS avg_total_slots, + FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION jbo, UNNEST(referenced_tables) referenced_table + WHERE + DATE(start_time) >= CURRENT_DATE - num_days_to_scan + AND ( + referenced_table.project_id||referenced_table.dataset_id||referenced_table.table_id + <> destination_table.project_id||destination_table.dataset_id||destination_table.table_id + ) + AND state = 'DONE' + AND error_result IS NULL + AND job_type = 'QUERY' + AND statement_type != 'SCRIPT' + AND referenced_table.table_id NOT LIKE '%INFORMATION_SCHEMA%' + AND user_email LIKE '%gserviceaccount.com' + GROUP BY 1,2,3,4 +) GROUP BY 1,2,3; diff --git a/scripts/optimization/tables_without_partitioning_or_clustering.sql b/scripts/optimization/tables_without_partitioning_or_clustering.sql new file mode 100644 index 000000000..578309368 --- /dev/null +++ b/scripts/optimization/tables_without_partitioning_or_clustering.sql @@ -0,0 +1,99 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named, tables_without_part_clust,
+ * that contains a list of the largest tables which are:
+ *   - not partitioned
+ *   - not clustered
+ *   - neither partitioned nor clustered
+ */
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+  SELECT
+    ARRAY_AGG(project_id)
+  FROM(
+    SELECT project_id
+    FROM `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION`
+    WHERE NOT deleted
+    GROUP BY 1
+    ORDER BY SUM(total_logical_bytes) DESC
+    LIMIT 100
+  )
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.tables_without_part_clust
+(
+  table_catalog STRING,
+  table_schema STRING,
+  table_name STRING,
+  table_url STRING,
+  partitioning_column STRING,
+  clustering_columns STRING,
+  logical_gigabytes FLOAT64,
+  logical_terabytes FLOAT64
+);
+
+FOR p IN (
+  SELECT project_id
+  FROM
+    UNNEST(projects) project_id
+)
+DO
+BEGIN
+  EXECUTE IMMEDIATE FORMAT(
+    """
+    INSERT INTO
+      optimization_workshop.tables_without_part_clust
+    SELECT
+      s.table_catalog,
+      s.table_schema,
+      s.table_name,
+      bqutil.fn.table_url(s.table_catalog || '.' || s.table_schema || '.' || s.table_name) AS table_url,
+      partitioning_column,
+      clustering_columns,
+      SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,30))) AS logical_gigabytes,
+      SUM(SAFE_DIVIDE(s.total_logical_bytes, POW(2,40))) AS logical_terabytes,
+    FROM
+      `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION` s
+    JOIN (
+      SELECT
+        table_catalog,
+        table_schema,
+        table_name,
+        STRING_AGG(IF(IS_PARTITIONING_COLUMN="YES", column_name, CAST(NULL AS STRING))) AS partitioning_column,
+        STRING_AGG(
+          CASE WHEN clustering_ordinal_position IS NOT NULL THEN column_name END
+          ORDER BY CLUSTERING_ORDINAL_POSITION
+        ) AS clustering_columns
+      FROM `%s.region-us.INFORMATION_SCHEMA.COLUMNS`
+      GROUP BY 1,2,3
+    ) c ON (s.project_id = c.table_catalog AND s.table_schema = c.table_schema AND s.table_name = c.table_name)
+    WHERE
+      clustering_columns IS NULL OR partitioning_column IS NULL
+    GROUP BY
+      s.table_catalog,
+      s.table_schema,
+      s.table_name,
+      table_url,
+      partitioning_column,
+      clustering_columns;
+    """,
+    p.project_id);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; -- ignore errors
+END;
+END FOR;
diff --git a/scripts/optimization/viewable_queries_grouped_by_hash.sql b/scripts/optimization/viewable_queries_grouped_by_hash.sql
new file mode 100644
index 000000000..ffd275f91
--- /dev/null
+++ b/scripts/optimization/viewable_queries_grouped_by_hash.sql
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2023 Google LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * This script creates a table named, viewable_queries_grouped_by_hash,
+ * which groups the queries run over the past 30 days by their query hash and
+ * retains a raw sample of the query text for each group.
+ * 30 days is the default timeframe, but you can change this by setting the
+ * num_days_to_scan variable to a different value.
+ * Queries are grouped by their normalized query pattern, which ignores
+ * comments, parameter values, UDFs, and literals in the query text.
+ * This allows us to group queries that are logically the same, but
+ * have different literals.
+ *
+ * For example, the following queries would be grouped together:
+ *   SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-01'
+ *   SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-02'
+ *   SELECT * FROM `my-project.my_dataset.my_table` WHERE date = '2020-01-03'
+ */
+
+DECLARE num_days_to_scan INT64 DEFAULT 30;
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+  SELECT
+    ARRAY_AGG(project_id)
+  FROM(
+    SELECT project_id
+    FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_ORGANIZATION
+    WHERE DATE(creation_time) >= CURRENT_DATE - 30
+    GROUP BY 1
+    ORDER BY SUM(total_bytes_billed) DESC
+    LIMIT 100
+  )
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.viewable_queries_grouped_by_hash
+(
+  Query_Hash STRING,
+  Query_Raw_Sample STRING,
+  Job_Origin STRING,
+  Ref_Tables STRING,
+  Days_Active INT64,
+  Job_Count INT64,
+  Avg_Job_Count_Active_Days INT64,
+  Project_Id STRING,
+  BQ_Region STRING,
+  Reservation_Id STRING,
+  Total_Gigabytes_Processed INT64,
+  Total_Gigabytes_Processed_Per_Job INT64,
+  Avg_Gigabytes_Processed INT64,
+  Total_Slot_Hours INT64,
+  Avg_Total_Slot_Hours_per_Active_Day INT64,
+  Avg_Job_Duration_Seconds INT64,
+  Any_Job_Ids ARRAY<STRING>,
+  User_Emails STRING,
+  Labels STRING
+);
+
+FOR p IN (
+  SELECT project_id
+  FROM
+    UNNEST(projects) project_id
+)
+DO
+BEGIN
+EXECUTE IMMEDIATE FORMAT("""
+INSERT INTO optimization_workshop.viewable_queries_grouped_by_hash
+SELECT
+  query_hash AS Query_Hash,
+  ANY_VALUE(query_raw) AS Query_Raw_Sample,
+  SPLIT(ANY_VALUE(job_ids)[OFFSET(0)], '_')[OFFSET(0)] AS Job_Origin,
+  Ref_Tables AS Ref_Tables,
+  COUNT(DISTINCT creation_dt) AS Days_Active,
+  SUM(job_count) AS Job_Count,
+  CAST(AVG(job_count) AS INT64) AS Avg_Job_Count_Active_Days,
+  Project_Id AS Project_Id,
+  'us' AS BQ_Region,
+  Reservation_Id AS Reservation_Id,
+  CAST(SUM(total_gigabytes_processed) AS INT64) AS Total_Gigabytes_Processed,
+  CAST(SUM(total_gigabytes_processed) / SUM(job_count) AS INT64) AS Total_Gigabytes_Processed_Per_Job,
+  CAST(AVG(total_gigabytes_processed) AS INT64) AS Avg_Gigabytes_Processed,
+  CAST(SUM(total_slot_hours_per_day) AS INT64) AS Total_Slot_Hours,
+  CAST(AVG(total_slot_hours_per_day) AS INT64) AS Avg_Total_Slot_Hours_per_Active_Day,
+  CAST(AVG(avg_job_duration_seconds) AS INT64) AS Avg_Job_Duration_Seconds,
+  ANY_VALUE(job_ids) AS Any_Job_Ids,
+  STRING_AGG(DISTINCT user_emails_unnest) AS User_Emails,
+  STRING_AGG(DISTINCT labels_concat) AS Labels
+FROM (
+  SELECT
+    query_hash,
+    ANY_VALUE(query_raw) AS query_raw,
+    ref_tables AS ref_tables,
+    creation_dt AS creation_dt,
+    project_id AS project_id,
+    reservation_id AS reservation_id,
+    COUNT(*) AS job_count,
+    ARRAY_AGG(job_id ORDER BY total_slot_ms DESC LIMIT 10) AS job_ids,
+    SUM(total_slot_ms) / (1000 * 60 * 60) AS total_slot_hours_per_day,
+    SUM(total_bytes_processed) / POW(1024, 3) AS total_gigabytes_processed,
+    AVG(job_duration_seconds) AS avg_job_duration_seconds,
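+    -- Collect the distinct users and labels seen for jobs with this query hash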
ARRAY_AGG(DISTINCT user_email) AS user_emails, + STRING_AGG(DISTINCT labels_concat) AS labels_concat + FROM ( + SELECT + query_info.query_hashes.normalized_literals AS query_hash, + query AS query_raw, + DATE(jbp.creation_time) AS creation_dt, + jbp.project_id AS project_id, + jbp.reservation_id AS reservation_id, + jbp.job_id AS job_id, + jbp.total_bytes_processed AS total_bytes_processed, + jbp.total_slot_ms AS total_slot_ms, + jbp.total_slot_ms / TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, MILLISECOND) AS slots, + TIMESTAMP_DIFF(jbp.end_time, jbp.start_time, SECOND) AS job_duration_seconds, + user_email, + STRING_AGG(ref_tables.project_id || '.' || + IF + (STARTS_WITH(ref_tables.dataset_id, '_'), + 'TEMP', + ref_tables.dataset_id) || '.' || ref_tables.table_id + ORDER BY + ref_tables.project_id || '.' || + IF + (STARTS_WITH(ref_tables.dataset_id, '_'), + 'TEMP', + ref_tables.dataset_id) || '.' || ref_tables.table_id) AS ref_tables, + FORMAT("%%T", ARRAY_CONCAT_AGG(labels)) AS labels_concat + FROM + `%s.region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT as jbp + JOIN + UNNEST(referenced_tables) ref_tables + WHERE + DATE(jbp.creation_time) >= CURRENT_DATE - %i + AND jbp.end_time > jbp.start_time + AND jbp.error_result IS NULL + AND jbp.job_type = 'QUERY' + AND jbp.statement_type != 'SCRIPT' + AND ref_tables.table_id not like '%%INFORMATION_SCHEMA%%' + GROUP BY 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + ) + GROUP BY 1, 3, 4, 5, 6) +JOIN + UNNEST(user_emails) as user_emails_unnest +GROUP BY + Query_Hash, + Ref_Tables, + Project_Id, + BQ_Region, + Reservation_Id; +""", +p.project_id, num_days_to_scan); +EXCEPTION WHEN ERROR THEN SELECT @@error.message; --ignore errors +END; +END FOR; diff --git a/scripts/optimization/views_with_nonoptimal_join_condition.sql b/scripts/optimization/views_with_nonoptimal_join_condition.sql new file mode 100644 index 000000000..0ee15d0ec --- /dev/null +++ b/scripts/optimization/views_with_nonoptimal_join_condition.sql @@ -0,0 +1,70 @@ +/* + * Copyright 2023 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * This script creates a table named, views_with_nonoptimal_join_condition, + * that contains a list of views with non-optimal join conditions. 
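+ *
+ * A join condition is flagged when either side of the equality is wrapped
+ * in a function call, for example:
+ *   JOIN table_b b ON CAST(a.id AS STRING) = b.id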
+ */
+
+DECLARE projects ARRAY<STRING> DEFAULT (
+  SELECT ARRAY_AGG(DISTINCT project_id)
+  FROM `region-us.INFORMATION_SCHEMA.TABLE_STORAGE_BY_ORGANIZATION`
+  WHERE NOT deleted
+);
+
+CREATE SCHEMA IF NOT EXISTS optimization_workshop;
+CREATE OR REPLACE TABLE optimization_workshop.views_with_nonoptimal_join_condition
+(
+  view_name STRING,
+  view_url STRING,
+  view_definition STRING,
+  join_conditions ARRAY<STRING>
+);
+
+CREATE TEMP FUNCTION extract_nonoptimal_join_conditions(view_definition STRING) AS(
+  ARRAY_CONCAT(
+    REGEXP_EXTRACT_ALL(
+      REGEXP_REPLACE(UPPER(view_definition), r"\sON\s", "\nON "),
+      r"\nON\s+[A-Z_]+?\([^=]*?=[^=]*"
+    ),
+    REGEXP_EXTRACT_ALL(
+      REGEXP_REPLACE(UPPER(view_definition), r"\sON\s", "\nON "),
+      r"\nON\s+[^=]*?=\s*[A-Z_]+?\([^=]*"
+)));
+
+FOR p IN (
+  SELECT project_id
+  FROM
+    UNNEST(projects) project_id
+)
+DO
+BEGIN
+  EXECUTE IMMEDIATE FORMAT(r"""
+    INSERT INTO optimization_workshop.views_with_nonoptimal_join_condition
+    SELECT
+      table_name AS view_name,
+      bqutil.fn.table_url(table_catalog || '.' || table_schema || '.' || table_name) AS view_url,
+      view_definition,
+      extract_nonoptimal_join_conditions(view_definition) AS join_conditions
+    FROM
+      `%s.region-us.INFORMATION_SCHEMA.VIEWS`
+    WHERE
+      ARRAY_LENGTH(extract_nonoptimal_join_conditions(view_definition)) >= 1
+    """,
+    p.project_id);
+EXCEPTION WHEN ERROR THEN SELECT @@error.message; -- ignore errors
+END;
+END FOR;
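Once the script above has populated `optimization_workshop.views_with_nonoptimal_join_condition`, the results can be reviewed with a query along the following lines. This is an illustrative sketch rather than part of the patch; it assumes only the table and columns defined above:

```sql
SELECT
  view_url,
  join_condition
FROM
  optimization_workshop.views_with_nonoptimal_join_condition,
  UNNEST(join_conditions) AS join_condition
ORDER BY view_url;
```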