How to call an API from PySpark (in workers)

Tested in Databricks, where the `spark` session object used below is already defined in every notebook.

import pyspark.sql.functions as F
import requests

# build a one-column DataFrame holding the ids 0..99
cols = ["pokenum"]
# zip over a single iterable yields 1-tuples: (0,), (1,), ..., (99,)
pokenumbers = list(zip(range(100)))

df_pokenums = spark.createDataFrame(pokenumbers, cols)

# call API
def get_name(rows):
    """Look up the Pokemon name for the first row of a partition.

    The API has no batch endpoint, so only the first row of ``rows`` is
    queried; the remaining rows of the partition are ignored.

    Args:
        rows: List of rows for one partition (as produced by ``rdd.glom()``);
            each row must expose a ``pokenum`` attribute.

    Returns:
        A ``(status_code, name)`` tuple. ``status_code`` is the HTTP status
        of the response, or ``None`` when no response was obtained (empty
        partition, connection error, timeout). ``name`` is the Pokemon name,
        or ``'did not work'`` when it could not be retrieved.
    """
    failed = 'did not work'

    # An empty partition has nothing to look up; avoid IndexError on rows[0].
    if not rows:
        return None, failed

    # take the first item in list (API doesn't support batch)
    first = rows[0]
    url = f'https://pokeapi.co/api/v2/pokemon-form/{first.pokenum}'

    try:
        # A timeout keeps one hung connection from stalling the Spark task.
        resp = requests.get(url, timeout=10)
    except requests.RequestException:
        # No response object exists here, so there is no status code to report.
        return None, failed

    try:
        name = resp.json()['pokemon']['name']
    except (ValueError, KeyError, TypeError):
        # Body was not JSON, or did not have the expected structure
        # (e.g. a 404 for an unknown id).
        name = failed
    return resp.status_code, name

# apply to partitions: glom() turns each partition into one list of rows,
# so get_name is invoked once per partition on the workers
partitioned_rdd = df_pokenums.repartition(10).rdd
rows_per_partition = partitioned_rdd.glom()
rows_per_partition.map(get_name).collect()

Leave a Comment

This site uses Akismet to reduce spam. Learn how your comment data is processed.