대체적으로 연습 및 실무에서 윈도우 함수 관련 쿼리문을 많이 작성해봤던터라, 강의 이해가 잘되었던 것 같다.

 

FIRST VALUE, LAST VALUE, QUALIFY 연습문제 등을 통해서 몰랐던 새로운 함수들을 알게되었고

특히 QUALIFY 함수는 유용하게 쓰일 수 있을 것 같다.!!!

 

 

 

탐색함수 연습문제 1~3


문제 1) user들의 다음 접속 월과 다다음 접속 월을 구하는 쿼리를 작성해주세요.

LEAD는 반드시 정렬이 먼저 되어야 함 -> ORDER BY 추가

SELECT *,
       LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS lead_visit_month,
       LEAD(visit_month,2) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS lead2_visit_month



FROM advanced.analytics_function_01


문제 2) user들의 다음 접속 월과 다다음 접속 월, 이전 접속 월을 구하는 쿼리를 
작성해주세요.


SELECT *,
       LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month,
       LEAD(visit_month,2) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after2_visit_month,
       LAG(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS before_visit_month

ORDER BY user_id, visit_month       
FROM advanced.analytics_function_01


문제 3) 유저가 접속 했을 때, 다음 접속까지의 간격


SELECT *,
       after_visit_month - visit_month AS diff

FROM
(       
    SELECT *,
    LEAD(visit_month, 1) OVER (PARTITION BY user_id ORDER BY visit_month) AS after_visit_month

    FROM advanced.analytics_function_01
)

 

 

 

 

FIRST VALUE -> 첫번째 값 구하기 (NULL 값 포함) \, NULL 값을 제외하고 싶으면, OVER 앞 IGNORE NULLS 추가
LAST VALUE -> 마지막 값 구하기 (NULL 값 포함), NULL 값을 제외하고 싶으면, OVER 앞 IGNORE NULLS 추가


집계함수는 NULL을 제외 후 연산, COUNT * 제외


* 파티션이 없을 때
프레임 범위 기본값
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING


* 파티션이 있을 때
프레임 범위 기본값
파티션 내에서 현재 로우 까지가 기본값

---------------------------------------------------------------------

---------------------------------------------------------------------


Frame 설정하는 2가지 방법

1) ROWS
- 물리적인 행 수를 기준으로 경계를 지정
- 이전 행, 이후 3개의 행
- ROWS Frame를 더 많이 사용

2) RANGE
- 논리적인 값의 범위를 기준으로 지정
- 값의 3일 전, 3일 후




EX) 

AVG(col) OVER (PARTITION BY product_type ORDER BY timestamp

ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)
Frame의 시작과 끝 지점 명시하기
- PRECEDING : 현재 행 기준으로 이전 행
- CURRENT ROW : 현재 행
- FOLLOWING : 현재 행 기준으로 이후 행
- UNBOUNDED : 처음부터 또는 끝까지(사전적 의미 : 묶이지 않고 제한되지 않음)




AVG(col) OVER (PARTITION BY product_type ORDER BY timestamp
ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING)
 = 현재 행과 그 앞뒤 한 행씩을 포함해서 평균

AVG(col) OVER (PARTITION BY product_type ORDER BY timestamp
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
 = 파티션의 처음부터 현재 행을 포함한 평균

 

 

 

 

FRAME 함수 연습문제

-- amount_total: 전체 SUM
-- cumulative_sum : row 시점에 누적 SUM
-- cumulative_sum_by_user : row 시점에 유저별 누적 SUM
-- last_5_orders_avg_amount : order_id 기준으로 정렬하고, 직전 5개 주문의 평균 amount
-- 집계분석함수() OVER(PARTITION BY ~~~ ORDER BY ~~~ ROWS BETWEEN A AND B)



SELECT *,
       SUM(amount) OVER() AS amount_total,
-- OVER()에 아무것도 들어가지 않을 수 있음
       SUM(amount) OVER(ORDER BY order_id) AS cumulative_sum,
       SUM(amount) OVER(PARTITION BY user_id ORDER BY order_id) AS cumulative_sum_by_user,
       AVG(amount) OVER(ORDER BY order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING)
       

FROM advanced.orders      
ORDER BY user_id, order_date

 

 

 

 

QUALIFY 연습문제


WHERE 대신 QUALIFY를 사용하면 윈도우 함수의 결과에 대해 필터링할 수 있음
WHERE과 같이 사용하는 경우엔 WHERE 아래에 작성하면 됨

 SELECT
 order_id,
 order_date,
 user_id,
 amount,
 SUM(amount) OVER (PARTITION BY user_id) AS amount_total
 FROM advanced.orders
 WHERE 1=1
 QUALIFY amount_total >= 500

 

 

 

 

 

윈도우 함수 연습문제


문제 1

SELECT user,
       team,
       query_date,
       COUNT(query_date) OVER (PARTITION BY user) AS total_query_cnt
FROM advanced.query_logs
ORDER BY user, query_date




문제 2 

WITH base AS (
    SELECT 
          EXTRACT(week FROM query_date) AS week_number,
          user,
          team,
          COUNT(query_date) AS total_query_cnt
  FROM advanced.query_logs
  GROUP BY week_number, user, team
)

SELECT 
      week_number,
      team,
      user,
      total_query_cnt,
      RANK() OVER (PARTITION BY team ORDER BY total_query_cnt DESC) AS ranking_in_team
FROM base
QUALIFY ranking_in_team = 1
ORDER BY week_number, team




문제 3

WITH base AS (
    SELECT 
          EXTRACT(week FROM query_date) AS week_number,
          user,
          team,
          COUNT(query_date) AS query_cnt
  FROM advanced.query_logs
  GROUP BY week_number, user, team
)

SELECT 
      user,
      team,
      week_number,
      query_cnt,
      LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt
FROM base 
ORDER BY user, week_number




문제 4

SELECT 
      user,
      team,
      query_date,
      query_count,
      SUM(query_count) OVER (PARTITION BY user ORDER BY query_date,
      ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_count
FROM (
      SELECT 
          user, 
          team,
          query_date,
          COUNT(query_date) AS query_count
  FROM advanced.query_logs
  GROUP BY 1,2,3
)
ORDER BY user, query_date




문제 5

WITH raw_data AS (
     SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL
     SELECT DATE '2024-05-02', 13 UNION ALL
     SELECT DATE '2024-05-03', NULL UNION ALL
     SELECT DATE '2024-05-04', 16 UNION ALL
     SELECT DATE '2024-05-05', NULL UNION ALL
     SELECT DATE '2024-05-06', 18 UNION ALL
     SELECT DATE '2024-05-07', 20 UNION ALL
     SELECT DATE '2024-05-08', NULL UNION ALL
     SELECT DATE '2024-05-09', 13 UNION ALL
     SELECT DATE '2024-05-10', 14 UNION ALL
     SELECT DATE '2024-05-11', NULL UNION ALL
     SELECT DATE '2024-05-12', NULL
)

SELECT 
      date,
      LAST_VALUE(number_of_orders ignore nulls) OVER (ORDER BY date) AS number_of_orders
FROM raw_data




문제 6


WITH raw_data AS (
     SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL
     SELECT DATE '2024-05-02', 13 UNION ALL
     SELECT DATE '2024-05-03', NULL UNION ALL
     SELECT DATE '2024-05-04', 16 UNION ALL
     SELECT DATE '2024-05-05', NULL UNION ALL
     SELECT DATE '2024-05-06', 18 UNION ALL
     SELECT DATE '2024-05-07', 20 UNION ALL
     SELECT DATE '2024-05-08', NULL UNION ALL
     SELECT DATE '2024-05-09', 13 UNION ALL
     SELECT DATE '2024-05-10', 14 UNION ALL
     SELECT DATE '2024-05-11', NULL UNION ALL
     SELECT DATE '2024-05-12', NULL
)

SELECT 
      date, 
      number_of_orders,
      AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg
FROM (
      SELECT 
            date,
            LAST_VALUE(number_of_orders ignore nulls) OVER (ORDER BY date) AS number_of_orders
      FROM raw_data
)




문제 7


WITH base AS (
  SELECT 
    event_date,
    event_timestamp,
    DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul') AS event_datetime,
    event_name,
    user_id,
    user_pseudo_id,
    LAG(DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul')) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS before_event_datetime
  FROM advanced.app_logs
  WHERE 1=1
  AND event_date = '2022-08-18'
)

SELECT
    *,
    DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) AS second_diff,
    CASE WHEN 
      DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) IS NULL
          OR DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 THEN 1 
      ELSE 0 
    END AS session_start,
    SUM(CASE WHEN DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 THEN 1 ELSE 0 END)
    OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) + 1 AS session_temp
FROM base

 

  1. STRUCT 과 UNNEST 처음 접해보는 내용이라, 복습 필요.
  2. PIVOT 내용 중 ANY_VALUE는 데이터 양이 많고, 어떤 데이터들이 어떤 특성을 가지고 담겨있는지 정확하게 모른다면 활용하면 위험하겠다는 생각이 들었음.

 

Q1. STRUCT, UNNEST

1. array_exercises 테이블에서 각 영화(title)별로 장르(genres)를 UNNEST해서 보여주세요.



SELECT title,
      --  genres,
       genre
       

FROM advanced.array_exercises AS ae
CROSS JOIN UNNEST(genres) AS genre

-- genres는 평면화가 된 데이터를 의미
-- genres가 지금 배열
-- ARRAY : 같은 타입의 여러 데이터를 저장하고 싶을 때
-- ARRAY를 풀때 Flattten(평면화) -> UNNEST
-- UNNEST릃 할 때는 CROSS JOIN + UNNEST(ARRAY_COLUMN) 컬럼 명시









2) array_exercises 테이블에서 각 영화(title)q별로 배우(actor)와 배역(character)을 보여주세요. 배우와 배역은 별도의 컬럼으로 나와야 함.



SELECT title,
      --  actors

-- actor에 직접 접근하면 어떨까 -> 새로운 컬럼으로 가능하나, 매번 SAFE_OFFSET을 지정해야 함
-- actors = [STRUCT(STRING,STRING)]

actors[SAFE_OFFSET(0)].actor AS first_actor,
actors[SAFE_OFFSET(0)].actor AS first_character,
actors[SAFE_OFFSET(1)].actor AS second_actor,
actors[SAFE_OFFSET(1)].actor AS second_character

-- 배열에 직접 접근이 아닌 UNNEST로 풀어야 편리할 듯

FROM advanced.array_exercises as ae



---------------------------------------------------------------
---------------------------------------------------------------




SELECT title,
       actor.actor,
       actor.character

FROM advanced.array_exercises as ae
CROSS JOIN UNNEST(actors) AS actor
-- actors가 배열










3) array_exercises 테이블에서 각 영화(title) 별로 배우(actor), 배역(character), 장르(genre)를 출력. 한 row에 배우, 배역, 장르가 모두 표시되어야 함.




SELECT title,
      --  actors, #ARRAY<STRUCT(STRING, STRING)>
      actor.actor as actor,
      actor.character as character,
      -- genres # ARRAY<STRING>
      genre

FROM advanced.array_exercises
CROSS JOIN UNNEST(actors) AS actor
CROSS JOIN UNNEST(genres) AS genre

 

 

 

Q2. PIVOT

1-1)
1) orders 테이블에서 유저(user_id)별로 주문 금액(amount)의 합계를 PIVOT해주세요. 
날짜(order_date)를 행(Row)으로, user_id를 열(Column)으로 만들어야 합니다

SELECT order_date,
       IF(user_id = 1, amount , NULL) AS user_1,
       IF(user_id = 2, amount , NULL) AS user_2,
       IF(user_id = 3, amount , NULL) AS user_3

FROM(
  SELECT order_date,
        user_id,
        amount       

  FROM advanced.orders
  GROUP BY order_date, user_id, amount
  ORDER BY order_date
)
---------------------------------------------------------------
---------------------------------------------------------------

1-2)
SELECT order_date,
       MAX(IF(user_id = 1, amount , NULL)) AS user_1,
       MAX(IF(user_id = 2, amount , NULL)) AS user_2,
       MAX(IF(user_id = 3, amount , NULL)) AS user_3

FROM(
  SELECT order_date,
        user_id,
        amount       

  FROM advanced.orders
  GROUP BY order_date, user_id, amount
  ORDER BY order_date
)

GROUP BY order_date
ORDER BY order_date




---------------------------------------------------------------
---------------------------------------------------------------


2)  orders 테이블에서 날짜(order_date)별로 유저들의 주문 금액(amount)의 합계를 PIVOT해주세요. user_id를 행(Row)으로, order_date를 열(Column)으로 만들어야 합니다.

SELECT order_date,
       SUM(IF(user_id = 1, amount , NULL)) AS user_1,
       SUM(IF(user_id = 2, amount , NULL)) AS user_2,
       SUM(IF(user_id = 3, amount , NULL)) AS user_3
FROM advanced.orders
GROUP BY order_date       
ORDER BY order_date



backtick 활용
any value는 어디에 활용할 수 있을지? -> 데이터는 믿을수 없기에 일부 데이터만 보고 사용 판단하기엔 위험할 것 같음.

SELECT user_id,
       SUM(IF(order_date = "2023-05-01", amount, 0)) AS `2023-05-01`,
       SUM(IF(order_date = "2023-05-02", amount, 0)) AS `2023-05-02`,
       SUM(IF(order_date = "2023-05-03", amount, 0)) AS `2023-05-03`,
       SUM(IF(order_date = "2023-05-04", amount, 0)) AS `2023-05-04`,
       SUM(IF(order_date = "2023-05-05", amount, 0)) AS `2023-05-05`,


FROM advanced.orders
GROUP BY user_id
ORDER BY user_id



---------------------------------------------------------------
---------------------------------------------------------------

 
3) orders 테이블에서 사용자(user_id)별, 날짜(order_date)별로 주문이 있다면 1, 없다면 0으로 PIVOT 해주세요. user_id를 행(Row)으로, order_date를 열(Column)로 만들고 주문을 많이 해도 1로 처리합니다

3-1) 주문 여부 1,0

SELECT user_id,
       SUM(IF(order_date = "2023-05-01", 1, 0)) AS `2023-05-01`,
       SUM(IF(order_date = "2023-05-02", 1, 0)) AS `2023-05-02`,
       SUM(IF(order_date = "2023-05-03", 1, 0)) AS `2023-05-03`,
       SUM(IF(order_date = "2023-05-04", 1, 0)) AS `2023-05-04`,
       SUM(IF(order_date = "2023-05-05", 1, 0)) AS `2023-05-05`,


FROM advanced.orders
GROUP BY user_id
ORDER BY user_id



3-2) 횟수


SELECT user_id,
       SUM(IF(order_date = "2023-05-01", 1, 0)) AS `2023-05-01`,
       SUM(IF(order_date = "2023-05-02", 1, 0)) AS `2023-05-02`,
       SUM(IF(order_date = "2023-05-03", 1, 0)) AS `2023-05-03`,
       SUM(IF(order_date = "2023-05-04", 1, 0)) AS `2023-05-04`,
       SUM(IF(order_date = "2023-05-05", 1, 0)) AS `2023-05-05`,


FROM advanced.orders
GROUP BY user_id
ORDER BY user_id





-- 앱 로그 PIVOT


WITH base AS(
    SELECT event_date,
          event_timestamp,
          event_name,
          user_id,
          user_pseudo_id,
          MAX(IF(param.key = "firebase_screen", param.value.string_value, NULL)) AS firebase_screen,
          MAX(IF(param.key = "food_id", param.value.int_value, NULL)) AS food_id,
          MAX(IF(param.key = "session_id", param.value.string_value, NULL)) AS session_id

    -- * EXCEPT(event_params)
          
    FROM advanced.app_logs
    CROSS JOIN UNNEST(event_params) AS param
    WHERE event_date = "2022-08-01"
    GROUP BY ALL

)

SELECT event_date,
       COUNT(user_id) AS user_cnt
FROM base
WHERE event_name = "click_cart" AND food_id = 1544
GROUP BY event_date

 

 

 

 

Q3. 퍼널

-- 이중 WITH 문
WITH BASE AS(
      SELECT event_date,
            event_timestamp,
            event_name,
            user_id,
            user_pseudo_id,
            platform,
            --  event_param

            MAX(IF(event_param.key = "firebase_screen", event_param.value.string_value, NULL )) AS firebase_screen,
            -- MAX(IF(event_param.key = "food_id", event_param.value.int_value, NULL )) AS food_id,
            MAX(IF(event_param.key = "session_id", event_param.value.string_value, NULL )) AS session_id
            

      FROM advanced.app_logs

      CROSS JOIN UNNEST(event_params) AS event_param

      WHERE event_date BETWEEN "2022-08-01" AND "2022-08-18"
      GROUP BY ALL
)


--event_name + screen (필요한 이벤트만 조건 걸어서 사용)
  ,filter_event_and_concat_event_and_screen AS(
  

  SELECT * EXCEPT(event_name, firebase_screen, event_timestamp),
        CONCAT(event_name, "-", firebase_screen) AS event_name_with_screen,
        DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime
  FROM BASE
  WHERE event_name IN ("screen_view","click_payment")
  )


--step_number + COUNT
--CASE WHEN 사용
SELECT event_date,
       event_name_with_screen,
       
       CASE WHEN event_name_with_screen = "screen_view-welcome" THEN 1
            WHEN event_name_with_screen = "screen_view-home" THEN 2
            WHEN event_name_with_screen = "screen_view-food_category" THEN 3
            WHEN event_name_with_screen = "screen_view-restaurant" THEN 4
            WHEN event_name_with_screen = "screen_view-cart" THEN 5
            WHEN event_name_with_screen = "click_payment-cart" THEN 6
       ELSE NULL
       END AS step_number,
       COUNT(DISTINCT user_pseudo_id) AS cnt
FROM filter_event_and_concat_event_and_screen
GROUP BY ALL
HAVING step_number IS NOT NULL
ORDER BY event_date

-- food_detail, search, search_result도 파악

 

 

+ Recent posts