The main issue I am having is that I have a table which should not contain any duplicate entries, because I want it to have a primary key that a separate table can reference. From what I understand about normalization, it is better to design the database this way. Right now the table has a bunch of duplicates: there should be only 6 unique entries, but there are 30, with each unique entry repeated 5 times.
How should I fix this? Should I fix it when importing the data, or by using the UNIQUE keyword? Note that when I tried UNIQUE this time around, it gave me an error, since the data I am importing does indeed contain duplicate entries.
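To illustrate, here is a minimal sqlite3 session (table and column names are just for illustration) showing the error I run into once the column is declared UNIQUE, plus the INSERT OR IGNORE variant that skips duplicates instead of raising:

import sqlite3

con = sqlite3.connect(":memory:")
cur = con.cursor()
cur.execute("CREATE TABLE Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT, division TEXT UNIQUE)")

cur.execute("INSERT INTO Divs(division) VALUES(?)", ("Atlantic",))
try:
    cur.execute("INSERT INTO Divs(division) VALUES(?)", ("Atlantic",))
except sqlite3.IntegrityError as e:
    print(e)  # the UNIQUE constraint rejects the second, identical row

# INSERT OR IGNORE silently drops the duplicate instead of raising
cur.execute("INSERT OR IGNORE INTO Divs(division) VALUES(?)", ("Atlantic",))
con.commit()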
EDIT:
This is what my items file looks like:
from scrapy.item import Item, Field


class TeamStats(Item):
    # fields collected at the team level
    team = Field()
    division = Field()
    rosterurl = Field()
    player_desc = Field()
    playerurl = Field()


class Player(Item):
    # per-player fields; only experience for now
    exp = Field()
This is what my spider looks like:
import scrapy

from nbastats.items import TeamStats, Player  # the item classes defined above

class NbastatsSpider(scrapy.Spider):
    name = "nbaStats"
    start_urls = [
        "http://espn.go.com/nba/teams"  # no allowed_domains: it caused issues when navigating to the team roster pages
    ]
    def parse(self, response):
        for division in response.xpath('//div[@id="content"]//div[contains(@class, "mod-teams-list-medium")]'):
            for team in division.xpath('.//div[contains(@class, "mod-content")]//li'):
                item = TeamStats()
                item['division'] = division.xpath('.//div[contains(@class, "mod-header")]/h4/text()').extract()[0]
                item['team'] = team.xpath('.//h5/a/text()').extract()[0]
                item['rosterurl'] = "http://espn.go.com" + team.xpath('.//div/span[2]/a[3]/@href').extract()[0]
                print(item['rosterurl'])
                # hand the partially-filled item to the roster page via request.meta
                request = scrapy.Request(item['rosterurl'], callback=self.parseWPNow)
                request.meta['play'] = item
                yield request
    def parseWPNow(self, response):
        item = response.meta['play']
        item = self.parseRoster(item, response)
        return item
    def parseRoster(self, item, response):
        players1 = []
        for players in response.xpath("//td[@class='sortcell']"):
            play = {}
            play['name'] = players.xpath("a/text()").extract()[0]
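            # note: the following-sibling lookups below return the full <td>...</td> HTML; adding /text() would extract just the cell contents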
            play['position'] = players.xpath("following-sibling::td[1]").extract()[0]
            play['age'] = players.xpath("following-sibling::td[2]").extract()[0]
            play['height'] = players.xpath("following-sibling::td[3]").extract()[0]
            play['weight'] = players.xpath("following-sibling::td[4]").extract()[0]
            play['college'] = players.xpath("following-sibling::td[5]").extract()[0]
            play['salary'] = players.xpath("following-sibling::td[6]").extract()[0]
            players1.append(play)
        item['playerurl'] = response.xpath("//td[@class='sortcell']/a").extract()
        item['player_desc'] = players1
        return item
This is what my pipeline looks like:
import sqlite3 as lite


class NbastatsPipeline(object):
    def __init__(self):
        self.setupDBCon()
        self.createTables()
    def setupDBCon(self):
        self.con = lite.connect('test.db')
        self.cur = self.con.cursor()
    def createTables(self):
        self.dropTeamsTable()
        self.dropPlayersTable()
        self.dropDivsTable()
        self.createTeamsTable()
        self.createPlayersTable()
        self.createDivsTable()
    def createTeamsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Teams(P_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            team TEXT, \
            DivId INTEGER, \
            FOREIGN KEY (DivId) REFERENCES Divs1(Did) \
            )")
    def createDivsTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Divs(Did INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, \
            division TEXT)")
    def createPlayersTable(self):
        self.cur.execute("CREATE TABLE IF NOT EXISTS Players(player_name TEXT, \
            salary TEXT, \
            weight INTEGER, \
            age INTEGER, \
            college TEXT )")
    def dropTeamsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Teams")
    def dropPlayersTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Players")
    def dropDivsTable(self):
        self.cur.execute("DROP TABLE IF EXISTS Divs")
    def closeDB(self):
        self.con.close()
    def __del__(self):
        self.closeDB()
    def process_item(self, item, spider):
        for key, value in item.items():
            if key == "division":
                print(item.get('division', ""))
                self.cur.execute("INSERT INTO Divs(division) VALUES(?)", (item.get('division', ""),))
                self.con.commit()
#                self.storeInDb(item)  # the line to use when ready to pass the whole item through to storeInDb; it will sit lower in the code
        return item
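For what it's worth, the direction I was considering is to declare the division column UNIQUE, switch to INSERT OR IGNORE, and then look up the generated Did when inserting the team row. This is only a rough, untested sketch of what process_item might become (it assumes createDivsTable is changed to declare division TEXT UNIQUE):

    def process_item(self, item, spider):
        division = item.get('division', "")
        # OR IGNORE skips the insert silently when the division already exists
        self.cur.execute("INSERT OR IGNORE INTO Divs(division) VALUES(?)", (division,))
        # look up the division's primary key so the Teams row can reference it
        self.cur.execute("SELECT Did FROM Divs WHERE division = ?", (division,))
        div_id = self.cur.fetchone()[0]
        self.cur.execute("INSERT INTO Teams(team, DivId) VALUES(?, ?)", (item.get('team', ""), div_id))
        self.con.commit()
        return item

Is that the right approach, or is it better to deduplicate before the data reaches the pipeline at all?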