只包含查询语法, 不包含具体机制.
基本查询
/*
DISTINCT 只能用在第一个列名前
用 DISTINCT 时, NULL 也被视为一类数据
*/
SELECT DISTINCT column_name,
column_name * 2 AS "中文别名要双引号",
COUNT(DISTINCT column_name),
-- NULL 做运算会得到 NULL, 但聚合函数事先排除了 NULL
SUM(column_name), AVG(column_name)
FROM table_name a -- 可以省略 AS
JOIN tbl2 b ON a.col1 = b.col1 -- 有 JOIN 时最好指明哪个表的列
WHERE a.col3 = '字符串单引号'
AND col4 <> '标准 SQL 的不等号'
AND col5 IS NOT NULL -- NULL 不能用等式或不等式比较
/* 此外, 值为 NULL 的东西做逻辑判断时既不是真也不是假,
而是第三个值不确定 (三值逻辑) */
AND (str_col LIKE 'te%' OR str_col LIKE 'tra_n')
AND num_col BETWEEN 1 AND 10 -- 包含两端
AND col6 NOT IN (1, 2, 3)
AND col7 IN (SELECT col7 FROM tbl_3 c WHERE a.col2 = c.col2)
-- 有 GROUP BY 时只能 SELECT 那里聚合的列名或者聚合函数, 以及常数
SELECT col1 AS alis, MAX(another_col)
FROM ...
WHERE ...
/* 不能使用别名, 因为先于 SELECT 执行
默认排序是随机的 */
GROUP BY col1
/* 列名限制同 SELECT */
HAVING COUNT(*) = 2 -- 对 col1 进行分类后, 总行数为 2 的组
ORDER BY alis DESC -- 最后执行, 所以可以用别名
/* 也可以用 SELECT 中未包含的列
也可以用聚合函数 */
SELECT SUM(CASE WHEN col='a' THEN col2 ELSE 0 END),
-- 不用 PARTITION BY 则不分组直接排序
-- 这里 ORDER BY 只表示窗口函数的计算方式
RANK () OVER (PARTITION BY col1 ORDER BY col2) AS ranking,
SUM (col3) OVER (PARTITION BY col4), -- 类似累加
AVG (num_col) OVER (ORDER BY col5 ROWS 2 PRECEDING) -- 前两行和当前行的移动平均
-- 另外能用例如 ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
FROM tab
ORDER BY ranking -- 排序
其他
- 合并查询 UNION (列数, 数据类型一致), ORDER BY 只能在最后使用一次. UNION ALL 不去重. INTERSECT, EXCEPT.
- LIMIT, TOP
- RANK, DENSE_RANK, ROW_NUMBER
练习
事先准备: 建立本地数据库
用 sqlite 创建本地数据库, 下面的查询用的也是 SQLite 语法.
import re
import sqlite3
import pandas as pd
def str_helper(iterable):
"""Note: Will have a trailing comma if there is only one entry"""
return str(tuple(iterable)).replace("'", '')
def insert_data(con, table_name, columns, data, from_scratch=True):
"""
Parameters
----------
con : sqlite3.Connection
table_name : str
columns : dict
key: str for column names
value: str for data types
data : list of tuple
Each tuple stands for a row.
from_scratch : bool
If True, drop the table if it exists and create a new one.
If False, only insert values.
Returns
-------
pd.DataFrame
The whole table
"""
with con:
if from_scratch:
con.execute(f"DROP TABLE IF EXISTS {table_name};")
con.execute(f"""
CREATE TABLE {table_name} (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
{re.sub(r"['{}:]", '', str(columns))}
);
""")
sql = f"""
INSERT INTO {table_name} {str_helper(columns.keys())}
VALUES {str_helper(['?'] * len(columns))};
"""
con.executemany(sql, data)
return pd.read_sql(f'SELECT * FROM {table_name};', con)
if __name__ == '__main__':
con = sqlite3.connect('playground.db')
table_name = 'tbl'
columns = {'user_id': 'INTEGER', 'login_time': 'TIMESTAMP'}
data = [
(1, '2021-12-01'),
(1, '2021-12-02'),
(2, '2021-12-01'),
(2, '2021-12-03'),
]
df = insert_data(con, table_name, columns, data)
连续登录 2 天的用户
table_name = 'tbl'
columns = {'user_id': 'INTEGER', 'login_time': 'TIMESTAMP'}
data = [
(1, '2021-12-01 09:00'),
(1, '2021-12-02 09:00'),
(1, '2021-12-04 09:00'),
(2, '2021-12-01 09:00'),
(2, '2021-12-01 12:00'), # different time in the same day
(2, '2021-12-03 09:00'),
]
SELECT
user_id
FROM
(
SELECT
user_id,
login_time,
-- A user can log in multiple times in one day
DENSE_RANK () OVER (PARTITION BY user_id ORDER BY date(login_time)) AS r
FROM
tbl
)
GROUP BY
user_id,
date(login_time, '-' || r || ' days')
HAVING
-- A user can log in multiple times in one day
COUNT(DISTINCT date(login_time)) >= 2
根据登录日期正序排序后, 减去排序号天数归一化日期, 其中 user_id
, date(login_time, '-' || r || ' days')
如下
1 2021-11-30
1 2021-11-30
1 2021-12-01
2 2021-11-30
2 2021-11-30
2 2021-12-01