Introduction:

Today, we are using BeautifulSoup, a Python library for parsing HTML and XML documents. It builds a parse tree from a page's markup, which can then be searched to extract data, making it a natural fit for web scraping. The website we will scrape the NBA teams from is www.espn.com. By scraping this information and storing it in a database like MongoDB, we can use the data in projects like this data science project.
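
Before diving in, here is a minimal sketch of the parse-tree idea (the HTML snippet and tag names below are made up purely for illustration):


from bs4 import BeautifulSoup

# a tiny, made-up HTML snippet, purely for illustration
html = '<div class="team"><a href="/celtics">Boston Celtics</a></div>'
soup = BeautifulSoup(html, 'html.parser')

# the parse tree can be searched by tag name and attributes;
# matched tags then expose their text and attributes
link = soup.find('a')
print(link.text)       # Boston Celtics
print(link['href'])    # /celtics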

Import statements:


from bs4 import BeautifulSoup
from pymongo import MongoClient
import pandas as pd
import re
from urllib.request import urlopen

Fetch the divisions and the teams:


url = 'http://www.espn.com/nba/players'
content = urlopen(url).read()
soup = BeautifulSoup(content, 'html.parser')
divisions = soup.find_all('div', class_='mod-header stathead')
divisions = [division.text for division in divisions]
teams = soup.find_all('a', attrs = {'style' : 'padding-top:5px;padding-left:0px;'})
teams = [team.text for team in teams]
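
As a quick sanity check (the exact counts assume ESPN's page layout has not changed since this was written), the two lists should line up as six divisions and thirty teams:


print(len(divisions))   # expected: 6,  e.g. ['Atlantic', 'Central', 'Southeast', ...]
print(len(teams))       # expected: 30, e.g. ['Boston Celtics', 'Brooklyn Nets', ...]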

Find the conference that the teams are in:


conference = {
                     'Atlantic': 'Eastern Conference',
                     'Central': 'Eastern Conference',
                     'Southeast': 'Eastern Conference',
                     'Pacific': 'Western Conference',
                     'Southwest': 'Western Conference',
                     'Northwest': 'Western Conference'
                 }

team_conference = {}
team_division = {}
counter = 0
division = 0
for team in teams:
    if (counter % 5 == 0) and (counter != 0):
        division += 1
    counter += 1

    # ESPN lists some teams as "LA ..."; expand the prefix to the full city name
    if re.search('^LA ', team):
        team = re.sub('LA ', 'Los Angeles ', team)

    team_conference[team] = conference[divisions[division]]
    team_division[team] = divisions[division]
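
After the loop, both dictionaries map a full team name to its division and conference. A quick spot check (assuming the Celtics appear under the Atlantic heading on the page):


print(team_division['Boston Celtics'])     # Atlantic
print(team_conference['Boston Celtics'])   # Eastern Conference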

Make dataframes from the dictionaries above, with the team as the index and its respective conference and division as the values:


conference_df = pd.DataFrame.from_dict(team_conference, orient='index', columns=['conference'])
division_df = pd.DataFrame.from_dict(team_division, orient='index', columns=['division'])

Merge the conference and division information for each team into one dataframe:


df = pd.merge(conference_df, division_df, left_index=True, right_index=True, how='inner')
df = df.sort_index()
df = df.reset_index(level=0)
df = df.rename({ 'index': '_id' }, axis=1)
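
If everything lined up, the first few rows of df should look roughly like this (the exact rows depend on what the scrape returned):


print(df.head(3))
#               _id          conference   division
# 0   Atlanta Hawks  Eastern Conference  Southeast
# 1  Boston Celtics  Eastern Conference   Atlantic
# 2   Brooklyn Nets  Eastern Conference   Atlantic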

Insert the dataframe into mlab (a hosted MongoDB database):


db = client['nba-api']   # 'client' is the MongoClient created in the main block below
collection = db['nba_teams']
collection.insert_many(df.to_dict('records'))
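
Note that insert_many will raise a duplicate-key error if the script is re-run, since _id values must be unique within a collection. If you want the load to be re-runnable, one alternative (a sketch, not part of the original script) is to upsert each document instead:


# upsert variant: replace an existing team document or insert a new one,
# so re-running the scraper does not raise a DuplicateKeyError
for doc in df.to_dict('records'):
    collection.replace_one({'_id': doc['_id']}, doc, upsert=True)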

Print a confirmation once the teams have been added to the MongoDB database:


print("NBA Teams added to MongoDB sucessfully!")

Main block:


if __name__ == "__main__":
    # connect to MongoDB (credentials kept in an external text file for safety reasons)
    with open('C:/Users/admin/Desktop/credentials.txt', 'r') as file:
        connectionURL = file.readline().strip()
    client = MongoClient(connectionURL)

    # fetch information on the NBA teams, such as their respective conference and division
    teams()
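
The script expects the first line of credentials.txt to be a complete MongoDB connection string. It would look something along these lines (placeholders, not real values):


mongodb://<username>:<password>@<host>:<port>/nba-api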

Complete code:


### National Basketball Association Data Cleanser/Scraper
### Scrape data from various basketball sites such as www.basketball-reference.com and store the data in a 
### NoSQL database for the REST API to use.

# import statements
from bs4 import BeautifulSoup
from pymongo import MongoClient
import pandas as pd
import re
from urllib.request import urlopen

# split each data source into separate components for easier maintenance
def teams():
    # fetch the divisions for each team
    url = 'http://www.espn.com/nba/players'
    content = urlopen(url).read()
    soup = BeautifulSoup(content, 'html.parser')
    divisions = soup.find_all('div', class_='mod-header stathead')
    divisions = [division.text for division in divisions]
    teams = soup.find_all('a', attrs = {'style' : 'padding-top:5px;padding-left:0px;'})
    teams = [team.text for team in teams]
    
    # get each team's respective conference
    conference = {
                     'Atlantic': 'Eastern Conference',
                     'Central': 'Eastern Conference',
                     'Southeast': 'Eastern Conference',
                     'Pacific': 'Western Conference',
                     'Southwest': 'Western Conference',
                     'Northwest': 'Western Conference'
                 }
    team_conference = {}
    team_division = {}
    counter = 0
    division = 0
    for team in teams:
        if (counter % 5 == 0) and (counter != 0):
            division += 1
        counter += 1
    
        if re.search('^LA ', team):
            team = re.sub('LA ', 'Los Angeles ', team)
       
        team_conference[team] = conference[divisions[division]]
        team_division[team] = divisions[division]
    
    # make dataframes from the dictionaries above, with the team as the index
    # and its respective conference and division as the values
    conference_df = pd.DataFrame.from_dict(team_conference, orient='index', columns=['conference'])
    division_df = pd.DataFrame.from_dict(team_division, orient='index', columns=['division'])
    
    # merge the conference and division information for each team into one dataframe
    df = pd.merge(conference_df, division_df, left_index=True, right_index=True, how='inner')
    df = df.sort_index()
    df = df.reset_index(level=0)
    df = df.rename({ 'index': '_id' }, axis=1)

    # insert into mlab (mongodb database)
    db = client['nba-api']   # 'client' is the module-level MongoClient created in the main block
    collection = db['nba_teams']
    collection.insert_many(df.to_dict('records'))
    
    # print progress
    print("NBA Teams added to MongoDB sucessfully!")
    
if __name__ == "__main__":
    # connect to MongoDB (credentials kept in an external text file for safety reasons)
    with open('C:/Users/admin/Desktop/credentials.txt', 'r') as file:
        connectionURL = file.readline().strip()
    client = MongoClient(connectionURL)

    # fetch information on the NBA teams, such as their respective conference and division
    teams()

Result of the populated MongoDB database (first 8 documents):

[Screenshot: the web-scraped team data stored as documents in the MongoDB database.]
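
To double-check the load from Python, a short query (reusing the collection handle from above) prints those first eight documents:


# print the first 8 team documents to confirm the insert worked
for doc in collection.find().limit(8):
    print(doc)
# e.g. {'_id': 'Atlanta Hawks', 'conference': 'Eastern Conference', 'division': 'Southeast'}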