Initial Commit

parents
/raw
/imdb_data.sqlite
#!/bin/bash
# Download every IMDb dataset listed in `files` from datasets.imdbws.com and
# store the decompressed result as raw/<name>.tsv.
#
# -e / pipefail: abort as soon as wget or gzip fails, instead of silently
# leaving an empty or truncated .tsv behind and moving on to the next file.
# -u: treat a misspelled variable as an error rather than an empty string.
set -euo pipefail

files=(name.basics title.akas title.basics title.crew title.episode title.principals title.ratings)
mkdir -p raw
for filename in "${files[@]}"; do
    # Stream the archive straight through gzip so the .gz never touches disk.
    # Write to a .part file first: only a fully decompressed download is
    # renamed to its final name.
    wget -O - "https://datasets.imdbws.com/${filename}.tsv.gz" | gzip -d -c > "raw/${filename}.tsv.part"
    mv "raw/${filename}.tsv.part" "raw/${filename}.tsv"
done
#!/bin/bash
# Rebuild imdb_data.sqlite from scratch: drop any previous database file,
# create the tables from schema.sql, then run the Python importer.
#
# Stop at the first failing step so the importer never runs against a
# database whose schema was only partially created.
set -euo pipefail

# -f: a missing database (first run) is not an error.
rm -f imdb_data.sqlite
# -bail: make sqlite3 stop at the first SQL error instead of continuing.
sqlite3 -bail imdb_data.sqlite < schema.sql
python3 import.py
import csv, sqlite3
# Open the SQLite database file "imdb_data.sqlite" in the current working
# directory (sqlite3.connect creates the file if it does not exist yet).
con = sqlite3.connect("imdb_data.sqlite")
# Module-level cursor; processFile() uses it for its executemany() calls.
cur = con.cursor()
def batch(iterable, n=1):
    """Yield the items of ``iterable`` grouped into lists of at most ``n``.

    Every yielded list holds exactly ``n`` items except possibly the last
    one, which holds the remainder. An empty ``iterable`` yields nothing,
    so callers never receive an empty batch.

    Args:
        iterable: any iterable; it is consumed lazily, one batch at a time.
        n: maximum batch size, must be at least 1.

    Raises:
        ValueError: if ``n`` is smaller than 1 (raised when iteration starts).
    """
    if n < 1:
        raise ValueError("batch size must be at least 1, got " + repr(n))
    chunk = []
    for elem in iterable:
        chunk.append(elem)
        if len(chunk) == n:
            yield chunk
            # Start a fresh list; the caller may still hold the previous one.
            chunk = []
    # Flush the trailing partial batch, but never an empty one.
    if chunk:
        yield chunk
# Columns whose text value is converted to int before insertion.
_INT_FIELDS = frozenset([
    "isAdult", "startYear", "endYear", "runtimeMinutes", "ordering",
    "isOriginalTitle", "seasonNumber", "episodeNumber", "numVotes",
])
# Columns whose text value is converted to float before insertion.
_FLOAT_FIELDS = frozenset(["averageRating"])


def processField(key, value):
    """Convert one raw TSV field into the Python value bound to the INSERT.

    Args:
        key: column name taken from the TSV header.
        value: raw field text. ``None`` is accepted as well, because
            csv.DictReader fills in ``None`` for fields missing from a
            short row.

    Returns:
        ``None`` for the ``\\N`` null marker or a missing field, an ``int``
        for columns in ``_INT_FIELDS``, a ``float`` for columns in
        ``_FLOAT_FIELDS``, and the value unchanged otherwise.

    Raises:
        ValueError: if a numeric column holds text that is not a number.
    """
    # A missing field is treated like the explicit "\N" marker instead of
    # crashing in int(None) / float(None).
    if value is None or value == "\\N":
        return None
    if key in _INT_FIELDS:
        return int(value)
    if key in _FLOAT_FIELDS:
        return float(value)
    return value
def processFile(name, sql):
    """Load one tab-separated file from raw/ into the database.

    Rows are read with csv.DictReader, so the header line of the file
    supplies the keys that the named placeholders in ``sql`` refer to.
    Each field goes through processField() first. Rows are inserted in
    batches of 1,000,000 with one commit per batch.

    Args:
        name: file name inside the raw/ directory.
        sql: INSERT statement with ``:column`` placeholders.
    """
    print("Processing: " + name)
    # Read as UTF-8 explicitly instead of relying on the platform default
    # encoding (NOTE(review): IMDb documents its dumps as UTF-8 — confirm).
    # newline="" is what the csv module asks for, so the reader handles
    # line endings itself.
    with open("raw/" + name, "r", encoding="utf-8", newline="") as fd:
        # QUOTE_NONE: quote characters in the data are ordinary text.
        reader = csv.DictReader(fd, delimiter="\t", quoting=csv.QUOTE_NONE)
        total = 0
        for rows in batch(reader, 1000000):
            cur.executemany(
                sql,
                [{k: processField(k, v) for (k, v) in row.items()} for row in rows],
            )
            con.commit()
            # Report the real number of rows written so far; the final batch
            # is usually smaller than a million rows.
            total += len(rows)
            print("Processed " + str(total) + " rows")
# One (TSV file, INSERT statement) pair per table, in load order. The title
# and name files come first; the other tables declare references to them.
_LOAD_PLAN = [
    ('title.basics.tsv', 'INSERT INTO title (tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres) VALUES (:tconst, :titleType, :primaryTitle, :originalTitle, :isAdult, :startYear, :endYear, :runtimeMinutes, :genres)'),
    ('name.basics.tsv', 'INSERT INTO name (nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles) VALUES (:nconst, :primaryName, :birthYear, :deathYear, :primaryProfession, :knownForTitles)'),
    ('title.akas.tsv', 'INSERT INTO title_aka (titleId,ordering,title,region,language,types,attributes,isOriginalTitle) VALUES (:titleId, :ordering, :title, :region, :language, :types, :attributes, :isOriginalTitle)'),
    ('title.crew.tsv', 'INSERT INTO title_crew (tconst,directors,writers) VALUES (:tconst, :directors, :writers)'),
    ('title.episode.tsv', 'INSERT INTO title_episode (tconst,parentTconst,seasonNumber,episodeNumber) VALUES (:tconst, :parentTconst, :seasonNumber, :episodeNumber)'),
    ('title.principals.tsv', 'INSERT INTO title_principals (tconst,ordering,nconst,category,job,characters) VALUES (:tconst, :ordering, :nconst, :category, :job, :characters)'),
    ('title.ratings.tsv', 'INSERT INTO title_ratings (tconst,averageRating,numVotes) VALUES (:tconst, :averageRating, :numVotes)'),
]

for _tsv_name, _insert_sql in _LOAD_PLAN:
    processFile(_tsv_name, _insert_sql)

# Everything has been committed batch by batch inside processFile().
con.close()
-- One row per nconst; filled by import.py from name.basics.tsv
-- ("\N" fields are stored as NULL).
CREATE TABLE name
(
    nconst            TEXT NOT NULL PRIMARY KEY,
    primaryName       TEXT,
    -- Years are INTEGER like title.startYear / title.endYear, so they compare
    -- and sort numerically. The importer binds them as text; SQLite's INTEGER
    -- column affinity converts well-formed numeric text on insert.
    birthYear         INTEGER,
    deathYear         INTEGER,
    primaryProfession TEXT,
    knownForTitles    TEXT
);
-- One row per tconst; filled by import.py from title.basics.tsv
-- ("\N" fields are stored as NULL).
CREATE TABLE title
(
    tconst         TEXT NOT NULL PRIMARY KEY,
    titleType      TEXT,
    primaryTitle   TEXT,
    originalTitle  TEXT,
    isAdult        INTEGER,
    startYear      INTEGER,
    endYear        INTEGER,
    runtimeMinutes INTEGER,
    genres         TEXT
);
-- Rows keyed by (titleId, ordering); filled by import.py from title.akas.tsv
-- ("\N" fields are stored as NULL).
CREATE TABLE title_aka
(
    titleId         TEXT    NOT NULL REFERENCES title (tconst),
    ordering        INTEGER NOT NULL,
    title           TEXT,
    region          TEXT,
    language        TEXT,
    types           TEXT,
    attributes      TEXT,
    isOriginalTitle INTEGER,
    PRIMARY KEY (titleId, ordering)
);
-- One row per tconst; filled by import.py from title.crew.tsv
-- ("\N" fields are stored as NULL).
CREATE TABLE title_crew
(
    tconst    TEXT NOT NULL PRIMARY KEY REFERENCES title (tconst),
    directors TEXT,
    writers   TEXT
);
-- One row per episode tconst, pointing at its parent title; filled by
-- import.py from title.episode.tsv ("\N" fields are stored as NULL).
CREATE TABLE title_episode
(
    tconst        TEXT NOT NULL PRIMARY KEY REFERENCES title (tconst),
    parentTconst  TEXT NOT NULL REFERENCES title (tconst),
    seasonNumber  INTEGER,
    episodeNumber INTEGER
);
-- Rows keyed by (tconst, ordering), each naming one nconst; filled by
-- import.py from title.principals.tsv ("\N" fields are stored as NULL).
CREATE TABLE title_principals
(
    tconst     TEXT    NOT NULL REFERENCES title (tconst),
    ordering   INTEGER NOT NULL,
    nconst     TEXT    NOT NULL REFERENCES name (nconst),
    category   TEXT,
    job        TEXT,
    characters TEXT,
    PRIMARY KEY (tconst, ordering)
);
-- One row per tconst; filled by import.py from title.ratings.tsv
-- ("\N" fields are stored as NULL).
CREATE TABLE title_ratings
(
    tconst        TEXT NOT NULL PRIMARY KEY REFERENCES title (tconst),
    averageRating REAL,
    numVotes      INTEGER
);
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment