-- 创建表结构 CREATE TABLE user_behavior ( id INT PRIMARY KEY, user_id INT, amount DECIMAL(10, 2), session_duration INT );
-- 插入测试数据 INSERT INTO user_behavior (id, user_id, amount, session_duration) VALUES (1, 101, 150.50, 300), -- Normal activity for User 101 (2, 101, 200.00, 450), -- Multiple transactions for User 101 (3, 102, NULL, NULL), -- User 102 exists but has missing data (4, 103, 50.00, 120), -- Normal activity for User 103 (5, 104, NULL, 60); -- User 104 has duration but no amount
-- COALESCE 函数 SELECT user_id, COALESCE(SUM(amount), 0) as total_spend, -- 消费总额特征 COALESCE(AVG(session_duration), 0) as avg_duration -- 平均停留特征 FROM user_behavior GROUP BY user_id;
SELECT c.Region, GROUP_CONCAT(DISTINCT cl.Language ORDER BY cl.Language) AS LanguagesSpoken, COUNT(DISTINCT cl.Language) As NumLanguages FROM country As c, countrylanguage as cl WHERE c.Code = cl.Countrycode GROUP BY c.Region HAVING NumLanguages < 10; -- HAVING过滤聚合结果
输出:
1 2 3 4 5 6 7
+---------------+-------------------+--------------+ | Region | LanguagesSpoken | NumLanguages | +---------------+-------------------+--------------+ | East Asia | Cantonese,Chinese | 2 | | North America | English,Spanish | 2 | +---------------+-------------------+--------------+ 2 rows in set (0.001 sec)
03. MySQL 8.4 新特性:窗口函数与JSON聚合增强
MySQL 8.4 LTS 中聚合能力在窗口函数框架和JSON处理上有了显著提升。
1 2 3 4 5 6 7 8
-- 生成JSON格式的区域语言报告 SELECT c.Region, JSON_ARRAYAGG( JSON_OBJECT('lang', cl.Language, 'isOfficial', cl.IsOfficial) ) AS lang_details FROM country c JOIN countrylanguage cl ON c.Code = cl.CountryCode GROUPBY c.Region;
输出:
1 2 3 4 5 6 7
+---------------+------------------------------------------------------------------------------------+ | Region | lang_details | +---------------+------------------------------------------------------------------------------------+ | East Asia | [{"lang": "Cantonese", "isOfficial": "F"}, {"lang": "Chinese", "isOfficial": "T"}] | | North America | [{"lang": "English", "isOfficial": "T"}, {"lang": "Spanish", "isOfficial": "F"}] | +---------------+------------------------------------------------------------------------------------+ 2 rows in set (0.001 sec)
MySQL 8.4优化:当窗口函数与GROUP BY结合时,优化器能更好地消除临时表
1 2 3 4 5
EXPLAIN FORMAT=TREE SELECT dept_id, AVG(salary) OVER (PARTITION BY dept_id) FROM employees GROUP BY dept_id;
-- 利用HeatWave列式存储加速聚合(相比InnoDB行式存储提升10-100倍) SELECT/*+ SET_VAR(rapid_execution_strategy=auto) */ user_segment, COUNT(*) as user_count, AVG(lifetime_value) as avg_ltv, MAX(last_purchase_date) as latest_purchase FROM customer_analytics GROUPBY user_segment;
-- 特征工程:实时聚合用户最近30天行为(OCI微服务架构) CREATETABLE user_features AS SELECT user_id, COUNT(DISTINCT session_id) as unique_sessions, -- 去重计数特征 SUM(CASEWHEN event_type ='click'THEN1ELSE0END) as total_clicks, AVG(CASEWHEN event_type ='purchase'THEN amount ELSENULLEND) as avg_purchase_amount, -- 使用GROUP_CONCAT生成行为序列(用于NLP模型) GROUP_CONCAT(DISTINCT event_type ORDERBY created_at DESC) as recent_behavior_sequence FROM user_events WHERE created_at > DATE_SUB(NOW(), INTERVAL30DAY) GROUPBY user_id;
-- 使用JSON_OBJECTAGG生成结构化特征向量(适合ML模型输入) SELECT user_id, JSON_OBJECTAGG(feature_name, feature_value) as feature_vector FROM ( SELECT user_id, 'click_rate'as feature_name, click_count/imp_count as feature_value FROM metrics UNIONALL SELECT user_id, 'conversion_rate', conversion_count/click_count FROM metrics ) sub GROUPBY user_id;
索引优化聚合性能:
1 2 3 4 5 6
-- 为GROUP BY字段创建索引 CREATE INDEX idx_region ON country(Region);
-- 覆盖索引优化聚合查询(避免回表) CREATE INDEX idx_covering ON employees(dept_id, salary, bonus); -- 使得SELECT dept_id, SUM(salary), AVG(bonus) FROM employees GROUP BY dept_id 完全使用索引