Install needed package

if (!require("RMySQL")) install.packages("RMySQL")
## Loading required package: RMySQL
## Warning: package 'RMySQL' was built under R version 4.0.3
## Loading required package: DBI
if (!require("splitstackshape")) install.packages("splitstackshape")
## Loading required package: splitstackshape
## Warning: package 'splitstackshape' was built under R version 4.0.3

Practicum 2

Due to the time consuming issue I only import 100,100 rows here. All of the following functionality are tested to be able to scale to any number of rows. If you want to test all data feel free to change nrows.

testNames <- read.csv(file = "name.basics.tsv.gz", sep = '\t', header = TRUE, nrows = 10000)
testNames
testTitle <- read.csv(file="title.basics.tsv.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE, nrows = 100000)
testTitle
testEpisode <- read.csv(file="title.episode.tsv.gz", sep = '\t', header = TRUE, fill = TRUE, nrow = 10000)
testEpisode
testRating <- read.csv(file="title.ratings.tsv.gz", sep = '\t', header = TRUE, stringsAsFactors = FALSE, nrow = 10000)
testRating

2. ERD diagram

A. Tool used Used LucidaChart to create.
B. Places modified Added many-to-many associate table between title and names. normalized multi-valued attributes of generes, professions, and knowForTitle. Created lookup table for title type.
C. Attribute added Added age and number of movies to Name Table.
D. In at least BCNF Attributes in single value. No functional dependency between non-primray key attribute. All attributes only functional dependent on primary key. No redundency based on functional dependency.
ERD

Connect to MySQL

library(RMySQL)
db_user <- 'root'
db_password <- 'xxx'
db_name <- 'practicum2'
db_host <- 'localhost' # for local access
db_port <- 3306

# 3. Read data from db
mydb <-  dbConnect(MySQL(), user = db_user, password = db_password,
                 dbname = db_name, host = db_host, port = db_port)

3. Create tables

createTitleType <- "
create table if not EXISTS TitleType (
    tType varchar(360) primary key
);"

createTitle <- "
CREATE TABLE if not EXISTS Title (
    tconst varchar(360) primary key,
    titleType varchar(360),
    primaryTitle varchar(360),
    originalTitle varchar(360),
    isAdult boolean,
    startYear int(4),
    endYear int(4),
    runtimeMinutes int,
    foreign key (titleType) references TitleType(tType) 
);"

createGenre <- "
create table if not EXISTS Genre (
    genre varchar(360) primary key
);
"
createTitle_Generes <- "
create table if not EXISTS Title_Genre (
    tgId int primary key auto_increment,
    tconst varchar(360),
    genre varchar(360),
    foreign key (tconst) references Title(tconst) on delete cascade on update cascade,
    foreign key (genre) references Genre(genre) on delete cascade on update cascade
);
"

createEpisode <-"
create table if not EXISTS Episode (
    tconst varchar(360) primary key,
    parentTconst varchar(360),
    seasonNumber int,
    episodeNumber int,
    foreign key (parentTconst) references Title(tconst) on delete cascade on update cascade
);"

createRating <-"
create table if not EXISTS Rating (
    tconst varchar(360) primary key,
    averageRating float(2),
    numVotes bigint,
    foreign key (tconst) references Title(tconst) on delete cascade on update cascade
);
"

createNameBasic <- "
create table if not EXISTS NameBasic (
    nconst varchar(360) primary key,
    primaryName varchar(360) not null,
    birthYear int default 0,
    deathYear int default 0,
    age int default 0,
    numberOfMovie int default 0
);
"

createKnownForTitle <- "
create table if not EXISTS Name_Title (
    ntId int primary key auto_increment,
    nconst varchar(360),
    tconst varchar(360),
    foreign key (tconst) references Title(tconst),
    foreign key (nconst) references NameBasic(nconst)
);"

createProfession <- "
create table if not EXISTS Profession (
    profession varchar(360) primary key
);"

createName_Profession <-"
create table if not EXISTS Name_Profession (
    npid int primary key auto_increment,
    nconst varchar(360),
    profession varchar(360),
    foreign key (profession) references Profession(profession) on delete cascade on update cascade,
    foreign key (nconst) references NameBasic(nconst) on delete restrict on update cascade
);"

dbSendQuery(mydb, createTitleType)
## <MySQLResult:812938274,0,0>
dbSendQuery(mydb, createTitle)
## <MySQLResult:812938274,0,1>
dbSendQuery(mydb, createGenre)
## <MySQLResult:812938274,0,2>
dbSendQuery(mydb, createEpisode)
## <MySQLResult:812938274,0,3>
dbSendQuery(mydb, createRating)
## <MySQLResult:812938274,0,4>
dbSendQuery(mydb, createNameBasic)
## <MySQLResult:812938274,0,5>
dbSendQuery(mydb, createKnownForTitle)
## <MySQLResult:812938274,0,6>
dbSendQuery(mydb, createProfession)
## <MySQLResult:812938274,0,7>
dbSendQuery(mydb, createName_Profession)
## <MySQLResult:812938274,0,8>
res <- dbSendQuery(mydb, "SELECT * FROM information_schema.columns WHERE table_schema = 'practicum2';")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 4 imported as
## numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 10 imported as
## numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 11 imported as
## numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 12 imported as
## numeric
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 21 imported as
## numeric
res <- fetch(res, n=-1)
res

4. Insert Data to Mysql

Referential Integraty: When insert datat that requires foreign key constrainst. Data insertion will fail if the inserted foreign key does not exisists in reference table. Therefore Before insertion we need to check that the foreign key does exsists.
Solution: Loop through all data before insert to make sure that all the referencing keys are in the referenced table otherwise only the rows that pass referential integraty will be inserted. This requires a lot of time to acheive.

TitleType table

#Get all titleType
tType <- testTitle[,"titleType"]
tType <- unique(tType)
tType <- data.frame(tType)

dbWriteTable(mydb, value = tType, name="titletype", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from titletype;")
res <- fetch(res, n=-1)
res

Title Table

#Remove unnecessary colunms
titles <- testTitle[, !(names(testTitle) %in% "genres")]

titles[titles == "\\N"] <- ""

dbWriteTable(mydb, value = titles, name="title", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from title;")
res <- fetch(res, n=-1)
res

Genre Table

#Get all genre type
genres <- as.vector(testTitle[, "genres"])
genres <- unlist(strsplit(genres, ","))
genres <- unique(genres)
genre <- genres[!(genres %in% "\\N")]

genres <- data.frame(genre)

#Write to DB
dbWriteTable(mydb, value = genres, name = "genre", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from genre;")
res <- fetch(res, n=-1)
res

Title_Genre table

#Make genres single valued
library(splitstackshape)
keep <- c("tconst", "genres")
titleG <- testTitle[keep]
titleG <- cSplit(titleG, splitCols=2, sep = ",", direction = "long")[!is.na(genres)]
colnames(titleG) <- c("tconst", "genre")
titleG$tgId <- seq.int(nrow(titleG))
titleG <- titleG[,c(3, 1:2)]

#Write to DB
dbWriteTable(mydb, value = titleG, name = "title_genre", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from title_genre;")
res <- fetch(res, n=-1)
res

Episode Table

#need to clear foreign key

testEpisode[testEpisode == "\\N"] <- ""

#Write to DB
dbWriteTable(mydb, value = testEpisode, name = "episode", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from episode;")
res <- fetch(res, n=-1)
res

Rating Table

#Write to DB
dbWriteTable(mydb, value = testRating, name = "rating", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from rating;")
res <- fetch(res, n=-1)
res

NameBasic Table

#Drop unecessray column and \\n
drops <- c("primaryProfession", "knownForTitles")
testNames1 <- testNames[ , !(names(testNames) %in% drops)]
testNames1[testNames1 == "\\N"] <- "" 

#Write to DB
dbWriteTable(mydb, value = testNames1, name = "namebasic", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from namebasic;")
res <- fetch(res, n=-1)
res

Name_Title

#Create Many-to-Many relationship
library(splitstackshape)
keep <- c("nconst", "knownForTitles")
nameTitle <- testNames[keep]
nameTitle <- cSplit(nameTitle, splitCols=2, sep = ",", direction = "long")[!is.na(knownForTitles)]
colnames(nameTitle) <- c("nconst", "tconst")
nameTitle$ntId <- seq.int(nrow(nameTitle))
nameTitle <- nameTitle[,c(3, 1:2)]

#Write to DB
dbWriteTable(mydb, value = nameTitle, name = "name_title", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from name_title;")
res <- fetch(res, n=-1)
res

Profession Table

#Get all professions
profession <- as.vector(testNames[, "primaryProfession"])
profession <- unlist(strsplit(profession, ","))
profession <- unique(profession)

profession <- data.frame(profession)

#Write to DB
dbWriteTable(mydb, value = profession, name = "profession", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from profession;")
res <- fetch(res, n=-1)
res

Name_Profession Table

#Create many-to-many relationship
library(splitstackshape)
keep <- c("nconst", "primaryProfession")
nameProfession <- testNames[keep]
nameProfession <- cSplit(nameProfession, splitCols=2, sep = ",", direction = "long")[!is.na(primaryProfession)]
colnames(nameProfession) <- c("nconst", "profession")
nameProfession$npId <- seq.int(nrow(nameProfession))
nameProfession <- nameProfession[,c(3, 1:2)]

#Write to DB
dbWriteTable(mydb, value = nameProfession, name = "name_profession", append = TRUE, row.names = FALSE)
## [1] TRUE
res <- dbSendQuery(mydb, "select * from name_profession;")
res <- fetch(res, n=-1)
res

5. Update

Number of moie is defined by find all Known for title (all series filmed) and only keep series which titleType are movie.

#fetch from database
nameFromDB <- dbSendQuery(mydb, "select * from namebasic;")
nameFromDB <- fetch(nameFromDB, n=-1)
titleFromDB <- dbSendQuery(mydb, "select * from title;")
titleFromDB <- fetch(titleFromDB, n=-1)

#find age and number of movie
movieNum <- merge(nameTitle, testTitle, by="tconst")
movieNum <- movieNum[movieNum$titleType == "movie"]
year <- format(Sys.Date(), "%Y")
year <- as.numeric(year)

for (i in 1:nrow(nameFromDB)) {
  temp <- nameFromDB[i,]
  count <- nrow(movieNum[movieNum$nconst == temp$nconst,])
  temp$numberOfMovie <- count
  if(temp$birthYear == 0 ){ temp$age <- 0 }
  else if(temp$deathYear == 0) {
    temp$age <- year-temp$birthYear
  }else{
    temp$age <- temp$deathYear-temp$birthYear
  }
#update database
  query <- paste0("update namebasic set age= ", temp$age, ", numberOfMovie = ", temp$numberOfMovie, " where nconst = '", temp$nconst, "';")
  dbSendQuery(mydb, query)
}
res <- dbSendQuery(mydb, "select * from namebasic;")
res <- fetch(res, n=-1)
res

Notice: Since delimiters are not allowed in dbsendquery, they are eliminated in the following trigger and stored procedure queries. Thus the notebook can compile properly. In the acutaly MySQL delimiters are needed.

6. Trigger

insert to actor will not affect number of movie since it is a value realted to link table with title, which should not be modified if no new associate movie is inserted. Number of movie will only increase when linked product is a titleType of movie.

ageTrigger <- "
CREATE TRIGGER insertActorAge
before INSERT
ON namebasic FOR EACH ROW
BEGIN    
    if new.birthYear !=0 then
        if new.deathYear = 0 then set new.age = year(curdate()) - new.birthYear; End if;
        if new.deathYear != 0 then set new.age = new.deathYear - new.birthYear; end if;
    end if;
    set new.numberOfMovie = 0;
END;"

movieCountTrigger <- "

CREATE TRIGGER insertActorMovie
after INSERT
ON name_title FOR EACH ROW
BEGIN
    if (select titleType from title where title.tconst = new.tconst) = 'movie' then
    update namebasic set numberOfMovie = numberOfMovie+1 where new.nconst = namebasic.nconst;
    end if;
END;
"

#send query to db
dbSendQuery(mydb, ageTrigger)
## <MySQLResult:310341024,0,10044>
dbSendQuery(mydb, movieCountTrigger)
## <MySQLResult:310339792,0,10045>

insertActor insertMovie

7. View

actorView <- "
CREATE VIEW actors AS
    SELECT 
        primaryName, age, if (deathYear !=0, 'alive', 'dead') as isAlive, numberOfMovie 
    FROM
        namebasic;"

#send query to db
dbSendQuery(mydb, actorView)
## <MySQLResult:510189192,0,10046>

view

8. Season Number

I created two different queries. First one will select the max season number as season numeber for a serie this query run way faster than the second one. However, some season number is empty or not labeled by numbe. For example national geographic special labledd season by year so, the first query will omit them therefore I created second query to count each unique season. The second query takes a lot more time to run.

seasonNum <- "
SELECT max(seasonNumber) as seasonNum, primaryTitle FROM practicum2.episode as e join title as t on e.parentTconst = t.tconst group by primaryTitle;"

seasonNum2 <- "
SELECT count(distinct seasonNumber) as seasonNum, primaryTitle FROM practicum2.episode as e join title as t on e.parentTconst = t.tconst group by primaryTitle;
"

#send query to db
seasonNumRes <- dbSendQuery(mydb, seasonNum)
seasonNumRes <- fetch(seasonNumRes, n=-1)
seasonNumRes <- seasonNumRes[seasonNumRes$seasonNum <40, ]

hist(seasonNumRes$seasonNum, breaks = seq(0,40,1), xlab = "season Number")

9.Add Actor

Used transaction to make sure data finishes in one transaction

addActor <- "


CREATE PROCEDURE addActor(
    in nconst VARCHAR(360),
    in primaryName varchar(360),
    in birthYear int,
    in deathYear int,
    in profession varchar(360),
    in knownForTitle varchar(360)
)
BEGIN

    DECLARE errno INT;
    DECLARE EXIT HANDLER FOR SQLEXCEPTION
    BEGIN
    GET CURRENT DIAGNOSTICS CONDITION 1 errno = MYSQL_ERRNO;
    SELECT errno AS MYSQL_ERROR;
    ROLLBACK;
    END;
    
    start transaction;
    
    insert into namebasic value(nconst, primaryName, birthYear, deathYear, 0, 0);
    if profession is not null then
        if (select count(*) from profession as p where p.profession = profession) = 0
            then insert into profession value(profession); end if;
    insert into name_profession value(null, nconst, profession);
    end if;
    if knownForTitle is not null then
    insert into name_title value(null, nconst, knownForTitle);
    end if;
    
    commit work;
END;
"

#send query to db
dbSendQuery(mydb, addActor)
## <MySQLResult:310340184,0,10048>

addActor

10. Delete Actor

deleteActor <- "


CREATE PROCEDURE deleteActor(
    in nconst VARCHAR(360)
)
BEGIN

    DECLARE errno INT;
    DECLARE EXIT HANDLER FOR SQLEXCEPTION
    BEGIN
    GET CURRENT DIAGNOSTICS CONDITION 1 errno = MYSQL_ERRNO;
    SELECT errno AS MYSQL_ERROR;
    ROLLBACK;
    END;
    
    start transaction;
        delete from name_profession where name_profession.nconst = nconst;
        delete from name_title where name_title.nconst = nconst;
        delete from namebasic where namebasic.nconst = nconst;
    commit work;
END;"

#send query to db
dbSendQuery(mydb, deleteActor)
## <MySQLResult:514325272,0,10049>

deleteActor

11. Find actors that have more than 2 movies above average rating

Used two version
groupBy: count all the occurence of movies above avarage rating (slow)
whereClause: utilized numberOfMoive column, should be use if the data is ensured to be correct (fast)

groupBy <- "
select primaryName, age from namebasic as nb join name_title as nt on nb.nconst = nt.nconst
    join title as t on t.tconst = nt.tconst join rating as r on r.tconst = t.tconst where t.titleType = 'movie' and averageRating > (select avg(averageRating) from rating) group by nb.nconst having count(nb.nconst) > 2;
"

whereClause <- "
select distinct primaryName, age from namebasic as nb join name_title as nt on nb.nconst = nt.nconst
    join title as t on t.tconst = nt.tconst join rating as r on r.tconst = t.tconst where titleType = 'movie' and averageRating > (select avg(averageRating) from rating) and numberOfMovie>2;"

#send query to db
actor1Res <- dbSendQuery(mydb, groupBy)
actor1Res <- fetch(actor1Res, n=-1)
actor1Res
actor2Res <- dbSendQuery(mydb, whereClause)
actor2Res <- fetch(actor2Res, n=-1)
actor2Res

12. Index performance

The reason that select is faster with index is that after creating index it creates a sorted b tree which reduce the look up time to approximately Theta(1). Depending on how the tree structed. It may need more look up which still reduce time significantly probably to O(logN).

#create index
dbSendQuery(mydb, "create index primaryName on namebasic (primaryName);")
## <MySQLResult:310340184,0,10052>
#timer
start.time <- Sys.time()
query2 <- dbSendQuery(mydb, "SELECT * FROM practicum2.namebasic where primaryName = 'James L. Brooks';
")
end.time <- Sys.time()
query2 <- fetch(query2, n=-1)
time2 <- end.time - start.time

#drop index
dbSendQuery(mydb, "DROP INDEX primaryName ON namebasic;")
## <MySQLResult:310339792,0,10054>
start.time <- Sys.time()
query1 <- dbSendQuery(mydb, "SELECT * FROM practicum2.namebasic where primaryName = 'James L. Brooks';
")
end.time <- Sys.time()
query1 <- fetch(query1, n=-1)
time1 <- end.time - start.time

time1 <- as.numeric(time1)
time2 <- as.numeric(time2)

h <- c(time1, time2)
barplot(h, names.arg = c("without index", "with index"), ylab = "time in sec")

13. Index with insertion

Since after index created it will keep an sorted B-tree. Therefore, everytime the insersion happens, the inserted row needs to be in the correct posisiton which means B-tree needs to be checked for its invariance. Therefore, more index means more B-tree means more verification or restructure time.

#create index1
dbSendQuery(mydb, "create index startYear on title(startYear);")
## <MySQLResult:1919239970,0,10056>
#timer
start.time <- Sys.time()
query1 <- dbSendQuery(mydb, "insert into title value('tt9000000', 'short', 'safa', 'sdafb', 0, 2000, 0, 100);
")
end.time <- Sys.time()
time1 <- end.time - start.time

#create index2
dbSendQuery(mydb, "create index primaryTitle  on title(primaryTitle);")
## <MySQLResult:0,0,10058>
#timer
start.time <- Sys.time()
query2 <- dbSendQuery(mydb, "insert into title value('tt9000001', 'short', 'safa', 'sdafb', 0, 2000, 0, 100);
")
end.time <- Sys.time()
time2 <- end.time - start.time

#create index3
dbSendQuery(mydb, "create index originalTitle on title(originalTitle);")
## <MySQLResult:1,0,10060>
#timer
start.time <- Sys.time()
query3 <- dbSendQuery(mydb, "insert into title value('tt9000003', 'short', 'safa', 'sdafb', 0, 2000, 0, 100);
")
end.time <- Sys.time()
time3 <- end.time - start.time


#without index
dbSendQuery(mydb, "drop index startYear on title;")
## <MySQLResult:0,0,10062>
dbSendQuery(mydb, "drop index primaryTitle on title;")
## <MySQLResult:318730984,0,10063>
dbSendQuery(mydb, "drop index originalTitle on title;")
## <MySQLResult:356604408,0,10064>
#timer
start.time <- Sys.time()
query4 <- dbSendQuery(mydb, "insert into title value('tt9000004', 'short', 'safa', 'sdafb', 0, 2000, 0, 100);
")
end.time <- Sys.time()
time4 <- end.time - start.time

h <- c(as.numeric(time4), as.numeric(time1),as.numeric(time2), as.numeric(time3))
x <- c(0,1,2,3)
plot(x, h, type = "o", ylab = "excution time in sec", xaxt='n')
axis(1,at=c(0,1,2,3),labels=c( "no index", "1 index", "2 index", "3 index"))

dbClearResult(dbListResults(mydb)[[1]])
## [1] TRUE

Disconnect MySQL

on.exit(dbDisconnect(mydb))