Quick Start

This guide will get you up and running with chDB in minutes.

Your First Query

Let’s start with the simplest example:

import chdb

# Your first query
result = chdb.query("SELECT 1 as id, 'Hello World' as message", "CSV")
print(result)

Output:

1,"Hello World"

Output Formats

chDB supports multiple output formats for different use cases:

CSV (Default)

result = chdb.query("SELECT 1, 'test'", "CSV")
print(result)  # CSV string

DataFrame (Pandas)

import chdb

df = chdb.query("SELECT number, number*2 as doubled FROM numbers(5)", "DataFrame")
print(type(df))  # <class 'pandas.core.frame.DataFrame'>
print(df.head())

Arrow Table

table = chdb.query("SELECT number FROM numbers(1000)", "ArrowTable")
print(type(table))  # <class 'pyarrow.lib.Table'>
print(f"Rows: {len(table)}")

Pretty Format

result = chdb.query("""
    SELECT
        'Alice' as name, 25 as age
    UNION ALL
    SELECT 'Bob', 30
""", "Pretty")
print(result)

Working with Files

chDB can query 70+ file formats directly:

CSV Files

# Query a local CSV file
result = chdb.query("""
    SELECT count(*), avg(column_name)
    FROM file('data.csv', 'CSV')
""")

JSON Files

# Query JSON data
result = chdb.query("""
    SELECT * FROM file('data.json', 'JSONEachRow')
    WHERE field > 100
    LIMIT 10
""")

Parquet Files

# Efficient querying of Parquet files
result = chdb.query("""
    SELECT department, sum(salary) as total_salary
    FROM file('employees.parquet', 'Parquet')
    GROUP BY department
    ORDER BY total_salary DESC
""")

DataFrame Integration

Query pandas DataFrames directly:

import pandas as pd
import chdb

# Create sample DataFrame
df = pd.DataFrame({
    'product': ['A', 'B', 'C', 'A', 'B'],
    'sales': [100, 200, 150, 300, 250],
    'region': ['North', 'South', 'North', 'South', 'North']
})

# Query the DataFrame using chDB
result = chdb.query("""
    SELECT
        product,
        region,
        sum(sales) as total_sales,
        avg(sales) as avg_sales
    FROM Python(df)
    GROUP BY product, region
    ORDER BY total_sales DESC
""", "DataFrame")

print(result)

Memory vs Persistent Storage

In-Memory (Default)

Perfect for data analysis and temporary operations:

# All data stays in memory
result = chdb.query("""
    SELECT number, number * number as squared
    FROM numbers(1000000)
    WHERE number % 1000 = 0
""")

Persistent Storage

For data that needs to persist between sessions:

# Create a persistent database
conn = chdb.connect("my_database.chdb")
cur = conn.cursor()

# Create and populate table
cur.execute("""
    CREATE TABLE IF NOT EXISTS users (
        id UInt32,
        name String,
        email String
    ) ENGINE = MergeTree() ORDER BY id
""")

cur.execute("INSERT INTO users VALUES (1, 'Alice', 'alice@example.com')")
cur.execute("INSERT INTO users VALUES (2, 'Bob', 'bob@example.com')")

# Query the persistent data
cur.execute("SELECT * FROM users ORDER BY id")
for row in cur:
    print(row)

conn.close()

Performance Tips

Use Connection Objects for Multiple Queries

# More efficient for multiple queries
conn = chdb.connect()
cur = conn.cursor()

for i in range(100):
    cur.execute(f"SELECT {i} as iteration")
    result = cur.fetchone()

conn.close()

Error Handling

Handle errors gracefully:

import chdb

try:
    result = chdb.query("SELECT invalid_column FROM non_existent_table")
except chdb.ChdbError as e:
    print(f"Query error: {e}")
except Exception as e:
    print(f"Unexpected error: {e}")

Next Steps

Now that you’re familiar with the basics: