Skip to content
Snippets Groups Projects
Verified commit 5f9eb31c, authored by Janne Mareike Koschinski
Browse files

Initial Commit

parents
Branches
No related tags found
No related merge requests found
/raw
/imdb_data.sqlite
#!/bin/bash
# Download the IMDb TSV dumps and store them, decompressed, under raw/.
#
# -e / pipefail: stop at the first failure. Without pipefail the exit status
# of "wget | gzip" is gzip's alone, so a failed download would go unnoticed.
# -u: treat a reference to an unset variable as an error.
set -euo pipefail

files=(name.basics title.akas title.basics title.crew title.episode title.principals title.ratings)
mkdir -p raw
# Quote the expansions so each array element stays a single word.
for filename in "${files[@]}"; do
    wget -O - "https://datasets.imdbws.com/${filename}.tsv.gz" | gzip -d -c > "raw/${filename}.tsv"
done
#!/bin/bash
# Rebuild imdb_data.sqlite from scratch: drop any previous database file,
# create the tables from schema.sql, then run the Python importer.
#
# Stop at the first failing step so the importer never runs against a
# database whose schema could not be created.
set -euo pipefail

# -f: a missing database (e.g. on the first run) is not an error.
rm -f imdb_data.sqlite
sqlite3 imdb_data.sqlite < schema.sql
python3 import.py
import csv, sqlite3
# Module-wide connection to the SQLite database file (relative path, so it is
# resolved against the current working directory).
con = sqlite3.connect("imdb_data.sqlite")
# Cursor used by processFile() for its bulk inserts.
cur = con.cursor()
def batch(iterable, n=1):
    """Yield successive lists of at most ``n`` items taken from ``iterable``.

    Every yielded list is non-empty; only the last one may hold fewer than
    ``n`` items. An empty ``iterable`` yields nothing at all (the previous
    implementation yielded one spurious empty list in that case).

    Args:
        iterable: Any iterable; it is consumed lazily, one batch at a time.
        n: Maximum number of items per batch. Must be at least 1.

    Yields:
        Lists of consecutive items, in their original order.

    Raises:
        ValueError: If ``n`` is smaller than 1 (raised when iteration starts).
    """
    from itertools import islice

    if n < 1:
        # A batch size of zero could never make progress through the input.
        raise ValueError("n must be at least 1")

    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, n))
        if not chunk:
            # Input exhausted; never emit an empty trailing batch.
            return
        yield chunk
def processField(key, value):
    """Convert one raw TSV cell into the Python value that gets inserted.

    The marker ``\\N`` becomes ``None``. Cells of the integer columns are
    passed through ``int``, cells of the rating column through ``float``,
    and every other cell is returned unchanged.
    """
    integer_columns = (
        "isAdult", "startYear", "endYear", "runtimeMinutes", "ordering",
        "isOriginalTitle", "seasonNumber", "episodeNumber", "numVotes",
    )
    real_columns = ("averageRating",)

    # The missing-value marker wins over any column-specific conversion.
    if value == "\\N":
        return None
    if key in integer_columns:
        return int(value)
    if key in real_columns:
        return float(value)
    return value
def processFile(name, sql):
    """Load one TSV file from ``raw/`` into the database.

    The file is read as tab-separated text with a header row. Rows are
    converted cell by cell with ``processField`` and inserted in batches of
    1,000,000 through the module-level cursor, committing after each batch.

    Args:
        name: File name inside the ``raw/`` directory.
        sql: INSERT statement with named placeholders (``:column``) matching
            the TSV header names.
    """
    print("Processing: " + name)
    # encoding="utf-8": do not depend on the platform's default encoding
    #   (NOTE(review): IMDb documents these dumps as UTF-8 - confirm).
    # newline="": recommended by the csv module so the reader handles line
    #   endings itself.
    with open("raw/" + name, "r", encoding="utf-8", newline="") as fd:
        reader = csv.DictReader(fd, delimiter="\t", quoting=csv.QUOTE_NONE)
        for i, rows in enumerate(batch(reader, 1000000), start=1):
            # A generator avoids building a second full copy of the batch.
            cur.executemany(
                sql,
                ({k: processField(k, v) for k, v in row.items()} for row in rows),
            )
            con.commit()
            # i counts batches; the final batch may hold fewer rows.
            print("Processed " + str(i) + "M rows")
# (TSV file under raw/, INSERT statement with named placeholders) pairs,
# listed in the order in which they are imported.
IMPORT_JOBS = (
    ('title.basics.tsv', 'INSERT INTO title (tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres) VALUES (:tconst, :titleType, :primaryTitle, :originalTitle, :isAdult, :startYear, :endYear, :runtimeMinutes, :genres)'),
    ('name.basics.tsv', 'INSERT INTO name (nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles) VALUES (:nconst, :primaryName, :birthYear, :deathYear, :primaryProfession, :knownForTitles)'),
    ('title.akas.tsv', 'INSERT INTO title_aka (titleId,ordering,title,region,language,types,attributes,isOriginalTitle) VALUES (:titleId, :ordering, :title, :region, :language, :types, :attributes, :isOriginalTitle)'),
    ('title.crew.tsv', 'INSERT INTO title_crew (tconst,directors,writers) VALUES (:tconst, :directors, :writers)'),
    ('title.episode.tsv', 'INSERT INTO title_episode (tconst,parentTconst,seasonNumber,episodeNumber) VALUES (:tconst, :parentTconst, :seasonNumber, :episodeNumber)'),
    ('title.principals.tsv', 'INSERT INTO title_principals (tconst,ordering,nconst,category,job,characters) VALUES (:tconst, :ordering, :nconst, :category, :job, :characters)'),
    ('title.ratings.tsv', 'INSERT INTO title_ratings (tconst,averageRating,numVotes) VALUES (:tconst, :averageRating, :numVotes)'),
)

for tsv_name, insert_sql in IMPORT_JOBS:
    processFile(tsv_name, insert_sql)

# All files are loaded; release the database connection.
con.close()
-- People loaded from name.basics.tsv, one row per nconst.
CREATE TABLE name
(
    nconst            TEXT NOT NULL,
    primaryName       TEXT,
    -- Years are declared INTEGER, matching startYear/endYear on the title
    -- table, so they sort and compare numerically. The importer inserts them
    -- as strings; a column with INTEGER affinity stores well-formed numeric
    -- text as an integer.
    birthYear         INTEGER,
    deathYear         INTEGER,
    primaryProfession TEXT,
    knownForTitles    TEXT,
    PRIMARY KEY (nconst)
);
-- Titles loaded from title.basics.tsv, one row per tconst.
CREATE TABLE title
(
    tconst         TEXT NOT NULL,
    titleType      TEXT,
    primaryTitle   TEXT,
    originalTitle  TEXT,
    isAdult        INTEGER,
    startYear      INTEGER,
    endYear        INTEGER,
    runtimeMinutes INTEGER,
    genres         TEXT,
    PRIMARY KEY (tconst)
);
-- Alternative titles loaded from title.akas.tsv; a row is identified by the
-- title it belongs to plus its ordering number.
CREATE TABLE title_aka
(
    titleId         TEXT    NOT NULL REFERENCES title,
    ordering        INTEGER NOT NULL,
    title           TEXT,
    region          TEXT,
    language        TEXT,
    types           TEXT,
    attributes      TEXT,
    isOriginalTitle INTEGER,
    PRIMARY KEY (titleId, ordering)
);
-- Director and writer columns loaded from title.crew.tsv, one row per title.
CREATE TABLE title_crew
(
    tconst    TEXT NOT NULL REFERENCES title,
    directors TEXT,
    writers   TEXT,
    PRIMARY KEY (tconst)
);
-- Episode rows loaded from title.episode.tsv. Both tconst and parentTconst
-- reference the title table.
CREATE TABLE title_episode
(
    tconst        TEXT NOT NULL REFERENCES title,
    parentTconst  TEXT NOT NULL REFERENCES title,
    seasonNumber  INTEGER,
    episodeNumber INTEGER,
    PRIMARY KEY (tconst)
);
-- Rows loaded from title.principals.tsv, linking a title to a name; a row is
-- identified by the title plus its ordering number.
CREATE TABLE title_principals
(
    tconst     TEXT    NOT NULL REFERENCES title,
    ordering   INTEGER NOT NULL,
    nconst     TEXT    NOT NULL REFERENCES name,
    category   TEXT,
    job        TEXT,
    characters TEXT,
    PRIMARY KEY (tconst, ordering)
);
-- Rating rows loaded from title.ratings.tsv, one row per title.
CREATE TABLE title_ratings
(
    tconst        TEXT NOT NULL REFERENCES title,
    averageRating REAL,
    numVotes      INTEGER,
    PRIMARY KEY (tconst)
);
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment