Profiling Python Code
Before optimizing, identify bottlenecks using profiling tools:
import cProfile
import pstats

# Function to profile
def slow_function():
    total = 0
    for i in range(1000000):
        total += i * i
    return total

# Run profiler
profiler = cProfile.Profile()
profiler.enable()
result = slow_function()
profiler.disable()

# Print stats
stats = pstats.Stats(profiler)
stats.sort_stats('cumtime').print_stats(10)

# Alternative: timeit
import timeit
time_taken = timeit.timeit(slow_function, number=100)
print(f"Average time: {time_taken/100:.4f} seconds")
Profile both CPU time and memory usage (with the memory_profiler package) to identify optimization targets.
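For the memory side, the third-party memory_profiler package gives a line-by-line report of allocations. A minimal sketch, assuming the package is installed (pip install memory-profiler) and using an illustrative function name:

from memory_profiler import profile

@profile            # prints a line-by-line memory report when the function runs
def build_squares():
    data = [i * i for i in range(1000000)]   # the big allocation shows up here
    total = sum(data)
    del data
    return total

if __name__ == '__main__':
    build_squares()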
Optimizing Data Structures
Choosing the right data structure can dramatically improve performance:
# Lists vs Tuples
from sys import getsizeof
list_data = [1, 2, 3, 4, 5]
tuple_data = (1, 2, 3, 4, 5)
print(f"List size: {getsizeof(list_data)} bytes")
print(f"Tuple size: {getsizeof(tuple_data)} bytes")
# Sets for membership testing
large_list = [i for i in range(1000000)]
large_set = set(large_list)
print("Testing membership...")
%timeit 999999 in large_list # ~10ms
%timeit 999999 in large_set # ~100ns
# Collections module optimizations
from collections import deque, defaultdict
# Faster pops from both ends
dq = deque(range(1000000))
%timeit -n 1000 dq.popleft()  # O(1); much faster than list.pop(0), which is O(n)
Prefer tuples for immutable sequences, sets for membership tests, and deque for queue operations.
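The defaultdict imported above deserves a quick illustration too: it removes the per-key existence checks (or dict.setdefault calls) that plain-dict grouping code needs. A small sketch with illustrative sample data:

from collections import defaultdict

words = ['apple', 'avocado', 'banana', 'blueberry', 'cherry']

# Group words by first letter; missing keys are created as empty lists automatically
groups = defaultdict(list)
for word in words:
    groups[word[0]].append(word)

print(dict(groups))  # {'a': ['apple', 'avocado'], 'b': ['banana', 'blueberry'], 'c': ['cherry']}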
Leveraging Built-in Functions
Python's built-ins are implemented in C and often much faster than Python equivalents:
# List comprehensions vs loops
def squares_loop(n):
    result = []
    for i in range(n):
        result.append(i * i)
    return result

def squares_comp(n):
    return [i * i for i in range(n)]

%timeit squares_loop(10000)  # ~1.5ms
%timeit squares_comp(10000)  # ~1.1ms
# Map and filter
data = range(10000)
%timeit [x * x for x in data if x % 2 == 0] # ~600µs
%timeit list(map(lambda x: x * x, filter(lambda x: x % 2 == 0, data))) # ~800µs
# String joining
words = ['hello'] * 10000
def join_with_concat(words):
    result = ''
    for w in words:
        result += w
    return result
%timeit ''.join(words)           # ~100µs
%timeit join_with_concat(words)  # ~2ms
Built-ins like map(), filter(), and ''.join() run at C speed, though lambda call overhead can still make map()/filter() slower than an equivalent comprehension, as the timings above show.
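The same point applies to reductions: the built-in sum() iterates in C, so it typically beats an equivalent Python-level loop. A short sketch (timings are illustrative, not measured):

def manual_sum(n):
    total = 0
    for i in range(n):
        total += i
    return total

%timeit manual_sum(100000)     # Python-level loop
%timeit sum(range(100000))     # C-level loop, usually several times faster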
NumPy and C Extensions
For numerical computing, NumPy provides array operations that are orders of magnitude faster than pure-Python loops:
import numpy as np
import array
# Native Python list
py_list = [i for i in range(1000000)]
%timeit [x * 2 for x in py_list] # ~100ms
# Python array
py_array = array.array('i', py_list)
%timeit array.array('i', [x * 2 for x in py_array]) # ~90ms
# NumPy array
np_array = np.arange(1000000)
%timeit np_array * 2 # ~1ms
# Cython example (separate .pyx file)
'''
# cython: language_level=3
def cython_sum(arr):
    cdef long total = 0
    cdef int i
    for i in range(len(arr)):
        total += arr[i]
    return total
'''
# After compiling, can be 100x faster than Python
For numerical work, NumPy arrays and C extensions (Cython, ctypes) can provide massive speedups.
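Reductions show the same gap as element-wise operations: NumPy sums over a contiguous typed buffer instead of boxed Python ints. A short sketch reusing arrays like those above (timings are illustrative):

import numpy as np

py_list = list(range(1000000))
np_array = np.arange(1000000)

%timeit sum(py_list)       # iterates over boxed Python ints
%timeit np_array.sum()     # single C-level reduction, typically ~10x faster or more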
Concurrency and Parallelism
Choose the right approach based on your task type:
# I/O-bound: asyncio
import asyncio

async def fetch_data(url):
    # Simulate network request
    await asyncio.sleep(1)
    return f"data from {url}"

async def main():
    tasks = [fetch_data(f"url-{i}") for i in range(10)]
    results = await asyncio.gather(*tasks)
    print(results)

asyncio.run(main())  # all ten "requests" finish in about 1 second total
# CPU-bound: multiprocessing
from multiprocessing import Pool

def cpu_intensive(n):
    return sum(i * i for i in range(n))

if __name__ == '__main__':
    with Pool() as pool:
        results = pool.map(cpu_intensive, range(1000, 1100))
    print(results[:5])
# Mixed: concurrent.futures
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
import time

def mixed_work(task):
    if task['type'] == 'io':
        time.sleep(0.1)  # simulate I/O work
        result = f"fetched {task['id']}"
    else:
        result = sum(i * i for i in range(100000))  # CPU work
    return result

tasks = [{'type': 'io', 'id': i} for i in range(5)] + [{'type': 'cpu', 'id': i} for i in range(5)]

with ThreadPoolExecutor() as executor:
    results = list(executor.map(mixed_work, tasks))
Use asyncio for I/O-bound work, multiprocessing for CPU-bound work, and the concurrent.futures executors when you want a single pool interface that can dispatch mixed workloads to either threads or processes.
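ProcessPoolExecutor is imported above but not shown; swapping it in gives multiprocessing behaviour behind the same map() interface. A minimal sketch reusing the cpu_intensive function from the multiprocessing example:

from concurrent.futures import ProcessPoolExecutor

def cpu_intensive(n):
    return sum(i * i for i in range(n))

if __name__ == '__main__':
    # Same executor API as ThreadPoolExecutor, but work runs in separate
    # processes, so CPU-bound tasks are not serialized by the GIL
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(cpu_intensive, range(1000, 1100)))
    print(results[:5])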