Added: Remove Duplicates
Fixes #4
Carlos Sousa
2021-08-06 01:22:44 +02:00
committed by GitHub
5 changed files with 400 additions and 240 deletions

View File

@@ -5,10 +5,11 @@ services:
image: zebrajr/imdbscrapper:latest
volumes:
#- '${PWD}/src:/opt/imdbscrapper:ro'
- './src/scrapper/:/opt/imdbscrapper'
- './src/scrapper/:/opt/imdbscrapper:ro'
user: 1000:1000
environment:
- START_URL=910000
- removeDuplicates=1
- START_URL=2000000
- END_URL=0
- STEPUPCYCLE=50
- PROCESSES=5
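For reference, these variables are read with os.getenv() in the entrypoint's main() further down in this diff. A minimal sketch of that read path (the fallback values mirror the defaults used in main(); the comments are illustrative):

import os

remDuplicates = int(os.getenv('removeDuplicates', 0))  # 1 -> purge duplicate rows on boot
startURL = int(os.getenv('START_URL', 10000000))        # highest IMDb numeric ID to scan
endURL = int(os.getenv('END_URL', 0))                   # lowest ID (exclusive)
stepUpCycle = int(os.getenv('STEPUPCYCLE', 100))        # IDs checked per cycle
nrProcesses = int(os.getenv('PROCESSES', 5))            # parallel worker processes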

View File

@@ -3,7 +3,7 @@
-- https://www.phpmyadmin.net/
--
-- Host: imdbdb
-- Generation Time: Jul 29, 2021 at 06:24 PM
-- Generation Time: Aug 05, 2021 at 11:13 PM
-- Server version: 10.6.3-MariaDB-1:10.6.3+maria~focal
-- PHP Version: 7.4.1
@@ -52,6 +52,50 @@ CREATE DEFINER=`root`@`%` PROCEDURE `checkDuplicateSerie` (IN `idCheck` BIGINT(2
WHERE series.idSerie = idCheck;
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `getDuplicateMovies` () BEGIN
SELECT movies.idMovie, movies.name
FROM movies
GROUP BY movies.name
HAVING COUNT(movies.name) > 1;
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `getDuplicateSeries` () BEGIN
SELECT series.idSerie, series.name
FROM series
GROUP BY series.name
HAVING COUNT(series.name) > 1;
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `getMovieByName` (IN `movieName` VARCHAR(255)) BEGIN
SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
FROM movies
WHERE movies.name = movieName;
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `getMovies` (IN `valueRating` DOUBLE, IN `valueRatingCount` BIGINT(20), IN `valueReleaseDate` DATE) BEGIN
SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
FROM movies
WHERE movies.rating >= valueRating
AND movies.ratingCount >= valueRatingCount
AND movies.releaseDate >= valueReleaseDate
ORDER BY movies.rating DESC, movies.ratingCount DESC;
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `getSerieByName` (IN `serieName` VARCHAR(255)) BEGIN
SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
FROM series
WHERE series.name = serieName;
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `getSeries` (IN `valueRating` DOUBLE, IN `valueRatingCount` BIGINT(20), IN `valueReleaseDate` DATE) BEGIN
SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
FROM series
WHERE series.rating >= valueRating
AND series.ratingCount >= valueRatingCount
AND series.releaseDate >= valueReleaseDate
ORDER BY series.rating DESC, series.ratingCount DESC;
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `insertIgnore` (IN `inIDIgnore` BIGINT(20)) BEGIN
INSERT INTO ignoreList
(`idIgnore`)
@@ -88,22 +132,58 @@ CREATE DEFINER=`root`@`%` PROCEDURE `insertSerieGenre` (IN `idSerie` BIGINT(20),
VALUES(idSerie, idGenre);
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `retrieveMovieByName` (`movieName` VARCHAR(255)) BEGIN
SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
FROM movies
WHERE movies.name = movieName;
CREATE DEFINER=`root`@`%` PROCEDURE `removeDuplicateMovie` (IN `inMovie` BIGINT(20), IN `inName` VARCHAR(255)) BEGIN
DELETE FROM movies
WHERE movies.idMovie = inMovie;
DELETE FROM moviesGenre
WHERE moviesGenre.idMovie = inMovie;
INSERT INTO duplicateMovies
(duplicateMovies.idMovie, duplicateMovies.nameMovie)
VALUES (inMovie, inName);
INSERT INTO ignoreList
(ignoreList.idIgnore)
VALUES (inMovie);
END$$
CREATE DEFINER=`root`@`%` PROCEDURE `retrieveSerieByName` (`serieName` VARCHAR(255)) BEGIN
SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
FROM series
WHERE series.name = serieName;
CREATE DEFINER=`root`@`%` PROCEDURE `removeDuplicateSerie` (IN `inSerie` BIGINT(20), IN `inName` VARCHAR(255)) BEGIN
DELETE FROM series
WHERE series.idSerie = inSerie;
DELETE FROM seriesGenre
WHERE seriesGenre.idSerie = inSerie;
INSERT INTO duplicateSeries
(duplicateSeries.idSerie, duplicateSeries.nameSerie)
VALUES (inSerie, inName);
INSERT INTO ignoreList
(ignoreList.idIgnore)
VALUES (inSerie);
END$$
DELIMITER ;
-- --------------------------------------------------------
--
-- Table structure for table `duplicateMovies`
--
CREATE TABLE `duplicateMovies` (
`idMovie` bigint(20) NOT NULL,
`nameMovie` varchar(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- --------------------------------------------------------
--
-- Table structure for table `duplicateSeries`
--
CREATE TABLE `duplicateSeries` (
`idSerie` bigint(20) NOT NULL,
`nameSerie` varchar(255) NOT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- --------------------------------------------------------
--
-- Table structure for table `ignoreList`
--
@@ -182,6 +262,18 @@ CREATE TABLE `seriesGenre` (
-- Indexes for dumped tables
--
--
-- Indexes for table `duplicateMovies`
--
ALTER TABLE `duplicateMovies`
ADD PRIMARY KEY (`idMovie`);
--
-- Indexes for table `duplicateSeries`
--
ALTER TABLE `duplicateSeries`
ADD PRIMARY KEY (`idSerie`);
--
-- Indexes for table `ignoreList`
--
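The new duplicate-related procedures and tables are consumed by removeDuplicates() in rsc/functions.py below; the read-side procedures (getMovies / getSeries) have no caller in this commit. As a rough sketch only, assuming the same hard-coded credentials that createDBConnection() uses, getMovies could be called from Python like this (the threshold values are arbitrary):

import mysql.connector as mariadb

conn = mariadb.connect(host='imdbdb', user='root', password='secret', database='imdbscrapper')
cursor = conn.cursor(buffered=True)

# Movies rated >= 7.5 with at least 10000 votes, released on or after 2000-01-01 (arbitrary thresholds)
cursor.callproc('getMovies', [7.5, 10000, '2000-01-01'])
for resultset in cursor.stored_results():
    for idMovie, name, description, imdbURL, rating, ratingCount, releaseDate in resultset.fetchall():
        print(idMovie, name, rating, ratingCount)

conn.close()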

View File

@@ -0,0 +1,265 @@
import json
import requests
import mysql.connector as mariadb
from bs4 import BeautifulSoup
from time import sleep
# json, requests and BeautifulSoup are required by imdbscrapper() below
# Creates and returns a mariadb connection object
def createDBConnection():
mydb = mariadb.connect(
host = 'imdbdb',
user = 'root',
password = 'secret',
database = 'imdbscrapper'
)
return mydb
'''
Function to save data from lists (eg: movieTable) to file (eg: moviesFile)
'''
def saveToFile(dataTable, dataPath):
# Assume we couldn't open / write to file
sucessTest = False
while sucessTest != True:
# Try to open the file and append to it
# [ToDo:] change try position to fix duplicate entries
try:
f = open(dataPath, "a")
for row in dataTable:
# Initialize an empty Output String
outputStr = ""
# For each value in the list, add it as string to the output, separate each by ";"
for index, value in enumerate(row):
outputStr += str(value) + ";"
# Adds the new line (\n) to the string, writes to the file, prints to the screen
outputStr += "\n"
f.write(outputStr)
print(row)
f.close()
# If the write succeeded, the loop can stop retrying
sucessTest = True
except Exception as e:
print("Retrying updating - %s - %s" % (dataPath, e))
'''
Function to check for duplicate entries in the database.
Returns False if a duplicate is found, and None otherwise.
'''
def checkForDuplicate(idCheck):
tablesToCheck = ['checkDuplicateMovie', 'checkDuplicateSerie', 'checkDuplicateIgnore', 'checkDuplicateRecheck']
for table in tablesToCheck:
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
cursor.callproc(table, [idCheck,])
for results in cursor.stored_results():
result = results.fetchall()
commitDBConnection(mydb)
if len(result) > 0:
return False
'''
Remove Duplicates from the Main Tables
'''
def removeDuplicates():
# Remove Duplicate Movies
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
cursor.callproc('getDuplicateMovies')
for results in cursor.stored_results():
result = results.fetchall()
commitDBConnection(mydb)
for row in result:
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
print("Removing duplicate:: %s" %(row[0]))
cursor.callproc('removeDuplicateMovie', [row[0], row[1],])
commitDBConnection(mydb)
# Remove Duplicate Series
# [TODO] D.R.Y
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
cursor.callproc('getDuplicateSeries')
for results in cursor.stored_results():
result = results.fetchall()
commitDBConnection(mydb)
for row in result:
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
print("Removing duplicate:: %s" %(row[0]))
cursor.callproc('removeDuplicateSerie', [row[0], row[1],])
commitDBConnection(mydb)
def commitDBConnection(database):
database.commit()
database.close()
def saveIgnoreToDatabase(idIgnore):
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
cursor.callproc('insertIgnore', [idIgnore,])
commitDBConnection(mydb)
def saveRecheckToDatabase(idRecheck):
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
cursor.callproc('insertRecheck', [idRecheck,])
commitDBConnection(mydb)
'''
Function to save to the database
'''
def saveToDatabase(dataTable, inTable):
# [TODO] Change to dynamic values from docker-compose.yml
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
# Defines which procedures to call
if (inTable == 'movies'):
mainTable = 'insertMovie'
genreTable = 'insertMovieGenre'
if (inTable == 'series'):
mainTable = 'insertSerie'
genreTable = 'insertSerieGenre'
for row in dataTable:
print("Found %s" %(row[0]))
cursor.callproc(mainTable, [row[0],row[1],row[2],row[3],row[5],row[6],row[7],])
try:
if len(row[4]) > 1:
for genre in row[4]:
cursor.callproc(genreTable, [row[0],genre,])
continue
except Exception as e:
cursor.callproc(genreTable, [row[0],str(row[4]),])
commitDBConnection(mydb)
'''
Main function for the scrapper.
It builds each title URL, requests it, parses the response into a list,
appends that list to the movie or series table and finally calls
saveToDatabase(dataTable, inTable) to persist everything.
Requires a starting and an ending ID (as int), iterating in decreasing order (e.g. 1000, 999, 998, ...)
'''
def imdbscrapper(startURL, endURL):
# Configuration values for the scrapper
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
debugLevel = 40 # 20 will display Info messages, 40 errors
#logFile = "/opt/storage/info.log" # Log output
counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
moviesFile = "/opt/storage/movies.csv" # Where to store movie info
seriesFile = "/opt/storage/series.csv" # Where to store shows/series info
# Initialize List of lists
movieTable = []
serieTable = []
errorTable = []
reCheckTable = []
# Go in descending order from startURL to endURL
for i in range(startURL, endURL, -1):
#logging.basicConfig(filename=logFile, level=logging.INFO)
titleFixed = str(i).zfill(9) # Pads with leading zeros so the ID always has 9 digits
url = baseURL + titleFixed + '/' # Joins every part of the URL into a single string
dataRow = [] # Initializes the dataRow list
errorRow = [] # Initializes the errorRow list
reCheckRow = [] # Initializes the reCheckRow list
# Assume Non Duplicate
testDuplicate = True
# Test for Duplicate
testDuplicate = checkForDuplicate(titleFixed)
# If a duplicate is found, skip number
if testDuplicate is False:
continue
# Retry loop so we can wait and try again if a 503 (too many requests) is received
testNext = False
while testNext == False:
try:
testNext = True
dataRow.append(titleFixed)
# Requests, parses and loads into JSON the HTML response
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
# If Error 503 is found
if len(soup.findAll(text='Error 503')) > 0:
testNext = False
print ("Did we just got 503ed? Waiting 60...")
sleep(60)
data = json.loads(soup.find('script', type='application/ld+json').string)
# If the response is a TVEpisode, just skip the number altogether
if(data['@type'] == 'TVEpisode'):
saveIgnoreToDatabase(titleFixed)
continue
# Gets the desired values from the JSON response
dataRow.append(data['name'])
try:
dataRow.append(str(data['description']).replace(';', ''))
except Exception as e:
dataRow.append("Description unavailable")
dataRow.append(url)
try:
dataRow.append(data['genre'])
except Exception as e:
dataRow.append(0)
try:
dataRow.append(data['aggregateRating']['ratingValue'])
except Exception as e:
dataRow.append(0)
try:
dataRow.append(data['aggregateRating']['ratingCount'])
except Exception as e:
dataRow.append(0)
try:
dataRow.append(data['datePublished'])
except Exception as e:
dataRow.append('1000-01-01')
# Check whether it's a movie or a series/show, and append the row to the matching list of lists
if(data['@type'] == 'Movie'):
movieTable.append(dataRow)
if(data['@type'] == 'TVSeries'):
serieTable.append(dataRow)
except Exception as e:
# Prepares the error string, then append the error list to the list of lists of errors
#errorMessage = titleFixed + " - " + str(e)
#errorRow.append(errorMessage)
#errorTable.append(errorRow)
# If the error is page not available, append to reCheck list (Performance improvement on rechecks)
if("NoneType" in str(e)):
# [TODO] Page 404 not implemented
# If Error 404 is found
saveRecheckToDatabase(titleFixed)
#testNext = False
#print ("Uncaught Error? Waiting 10")
#sleep(10)
#recheckString = titleFixed + "\n"
#reCheckRow.append(recheckString)
#reCheckTable.append(reCheckRow)
# Writes the list of lists to each correct file
#saveToFile(movieTable, moviesFile)
#saveToFile(serieTable, seriesFile)
#saveToFile(errorTable, logFile)
#saveToFile(reCheckTable, reCheckFile)
saveToDatabase(movieTable, 'movies')
saveToDatabase(serieTable, 'series')
def main():
pass
if __name__ == "__main__":
main()
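main() in this module is a deliberate no-op, so everything here is meant to be imported. A minimal usage sketch mirroring what the entrypoint below does (the ID range is just an example):

import rsc.functions as scrapper

scrapper.removeDuplicates()              # purge duplicate movie/series rows first
scrapper.imdbscrapper(2000000, 1999990)  # scan IDs 2000000 down to 1999991 and store them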

View File

@@ -0,0 +1,17 @@
import os
def cls():
os.system('cls' if os.name=='nt' else 'clear')
'''
Keep the script from doing anything when executed directly (import-only helper)
'''
def main():
pass
if __name__ == "__main__":
main()
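As with rsc/functions.py, this module is import-only; a one-line usage sketch:

import rsc.helper as helper

helper.cls()  # clear the terminal (cls on Windows, clear elsewhere)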

View File

@@ -9,244 +9,29 @@ import logging
import mysql.connector as mariadb
from multiprocessing import Process
from bs4 import BeautifulSoup
# Creates and returns a mariadb connection object
def createDBConnection():
mydb = mariadb.connect(
host = 'imdbdb',
user = 'root',
password = 'secret',
database = 'imdbscrapper'
)
return mydb
def cls():
os.system('cls' if os.name=='nt' else 'clear')
'''
Function to save data from lists (eg: movieTable) to file (eg: moviesFile)
'''
def saveToFile(dataTable, dataPath):
# Assume we couldn't open / write to file
sucessTest = False
while sucessTest != True:
# Try to open the file and append to it
# [ToDo:] change try position to fix duplicate entries
try:
f = open(dataPath, "a")
for row in dataTable:
# Initialize an empty Output String
outputStr = ""
# For each value in the list, add it as string to the output, separate each by ";"
for index, value in enumerate(row):
outputStr += str(value) + ";"
# Adds the new line (\n) to the string, writes to the file, prints to the screen
outputStr += "\n"
f.write(outputStr)
print(row)
f.close()
# If the write succeeded, the loop can stop retrying
sucessTest = True
except Exception as e:
print("Retrying updating - %s - %s" % (dataPath, e))
'''
Function to check for duplicate entries in the database.
Returns False if a duplicate is found, and None otherwise.
'''
def checkForDuplicate(idCheck):
tablesToCheck = ['checkDuplicateMovie', 'checkDuplicateSerie', 'checkDuplicateIgnore', 'checkDuplicateRecheck']
for table in tablesToCheck:
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
cursor.callproc(table, [idCheck,])
for results in cursor.stored_results():
result = results.fetchall()
commitDBConnection(mydb)
if len(result) > 0:
return False
import rsc.functions as scrapper
import rsc.helper as helper
def commitDBConnection(database):
database.commit()
database.close()
def saveIgnoreToDatabase(idIgnore):
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
cursor.callproc('insertIgnore', [idIgnore,])
commitDBConnection(mydb)
def saveRecheckToDatabase(idRecheck):
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
cursor.callproc('insertRecheck', [idRecheck,])
commitDBConnection(mydb)
'''
Function to save to the database
'''
def saveToDatabase(dataTable, inTable):
# [TODO] Change to dynamic values from docker-compose.yml
mydb = createDBConnection()
cursor = mydb.cursor(buffered=True)
# Defines which procedures to call
if (inTable == 'movies'):
mainTable = 'insertMovie'
genreTable = 'insertMovieGenre'
if (inTable == 'series'):
mainTable = 'insertSerie'
genreTable = 'insertSerieGenre'
for row in dataTable:
print("Found %s" %(row[0]))
cursor.callproc(mainTable, [row[0],row[1],row[2],row[3],row[5],row[6],row[7],])
try:
if len(row[4]) > 1:
for genre in row[4]:
cursor.callproc(genreTable, [row[0],genre,])
continue
except Exception as e:
cursor.callproc(genreTable, [row[0],str(row[4]),])
commitDBConnection(mydb)
'''
Main function for the scrapper.
It builds each title URL, requests it, parses the response into a list,
appends that list to the movie or series table and finally calls
saveToDatabase(dataTable, inTable) to persist everything.
Requires a starting and an ending ID (as int), iterating in decreasing order (e.g. 1000, 999, 998, ...)
'''
def imdbscrapper(startURL, endURL):
# Configuration values for the scrapper
baseURL = "https://www.imdb.com/title/tt" # Base URL for each title
debugLevel = 40 # 20 will display Info messages, 40 errors
#logFile = "/opt/storage/info.log" # Log output
counterFile = "/opt/storage/counter.txt" # Which ID was last scanned
reCheckFile = "/opt/storage/recheck.txt" # Which IDs to recheck
moviesFile = "/opt/storage/movies.csv" # Where to store movie info
seriesFile = "/opt/storage/series.csv" # Where to store shows/series info
# Initialize List of lists
movieTable = []
serieTable = []
errorTable = []
reCheckTable = []
# Go in descending order from startURL to endURL
for i in range(startURL, endURL, -1):
#logging.basicConfig(filename=logFile, level=logging.INFO)
titleFixed = str(i).zfill(9) # Pads with leading zeros so the ID always has 9 digits
url = baseURL + titleFixed + '/' # Joins every part of the URL into a single string
dataRow = [] # Initializes the dataRow list
errorRow = [] # Initializes the errorRow list
reCheckRow = [] # Initializes the reCheckRow list
# Assume Non Duplicate
testDuplicate = True
# Test for Duplicate
testDuplicate = checkForDuplicate(titleFixed)
# If a duplicate is found, skip number
if testDuplicate is False:
continue
# Retry loop so we can wait and try again if a 503 (too many requests) is received
testNext = False
while testNext == False:
try:
testNext = True
dataRow.append(titleFixed)
# Requests, parses and loads into JSON the HTML response
html = requests.get(url).text
soup = BeautifulSoup(html, 'html.parser')
# If Error 503 is found
if len(soup.findAll(text='Error 503')) > 0:
testNext = False
print ("Did we just got 503ed? Waiting 60...")
sleep(60)
data = json.loads(soup.find('script', type='application/ld+json').string)
# If the response is a TVEpisode, just skip the number altogether
if(data['@type'] == 'TVEpisode'):
saveIgnoreToDatabase(titleFixed)
continue
# Gets the desired values from the JSON response
dataRow.append(data['name'])
try:
dataRow.append(str(data['description']).replace(';', ''))
except Exception as e:
dataRow.append("Description unavailable")
dataRow.append(url)
try:
dataRow.append(data['genre'])
except Exception as e:
dataRow.append(0)
try:
dataRow.append(data['aggregateRating']['ratingValue'])
except Exception as e:
dataRow.append(0)
try:
dataRow.append(data['aggregateRating']['ratingCount'])
except Exception as e:
dataRow.append(0)
try:
dataRow.append(data['datePublished'])
except Exception as e:
dataRow.append('1000-01-01')
# Check whether it's a movie or a series/show, and append the row to the matching list of lists
if(data['@type'] == 'Movie'):
movieTable.append(dataRow)
if(data['@type'] == 'TVSeries'):
serieTable.append(dataRow)
except Exception as e:
# Prepares the error string, then append the error list to the list of lists of errors
#errorMessage = titleFixed + " - " + str(e)
#errorRow.append(errorMessage)
#errorTable.append(errorRow)
# If the error is page not available, append to reCheck list (Performance improvement on rechecks)
if("NoneType" in str(e)):
# [TODO] Page 404 not implemented
# If Error 404 is found
saveRecheckToDatabase(titleFixed)
#testNext = False
#print ("Uncaught Error? Waiting 10")
#sleep(10)
#recheckString = titleFixed + "\n"
#reCheckRow.append(recheckString)
#reCheckTable.append(reCheckRow)
# Writes the list of lists to each correct file
#saveToFile(movieTable, moviesFile)
#saveToFile(serieTable, seriesFile)
#saveToFile(errorTable, logFile)
#saveToFile(reCheckTable, reCheckFile)
saveToDatabase(movieTable, 'movies')
saveToDatabase(serieTable, 'series')
def main():
cls()
helper.cls()
#imdbscrapper(903747,903733)
nrProcesses = int(os.getenv('PROCESSES', 5)) # Number of Processes to start in parallel
startURL = int(os.getenv('START_URL', 10000000)) # Starting Number
endURL = int(os.getenv('END_URL', 0)) # Ending Number
stepUpCycle = int(os.getenv('STEPUPCYCLE', 100)) # How many numbers will be checked in each cycle
stepUpProcess = int(stepUpCycle / nrProcesses) # Divides the numbers to be checked in each cycle by the total number of processes
remDuplicates = int(os.getenv('removeDuplicates', 0)) # If 1: call removeDuplicates()
nrProcesses = int(os.getenv('PROCESSES', 5)) # Number of Processes to start in parallel
startURL = int(os.getenv('START_URL', 10000000)) # Starting Number
endURL = int(os.getenv('END_URL', 0)) # Ending Number
stepUpCycle = int(os.getenv('STEPUPCYCLE', 100)) # How many numbers will be checked in each cycle
stepUpProcess = int(stepUpCycle / nrProcesses) # Divides the numbers to be checked in each cycle by the total number of processes
# Eg: 5 Processes running each cycle. 100 Numbers each cycle. 20 Numbers per process
# Remove Duplicates on boot?
if remDuplicates == 1:
scrapper.removeDuplicates()
# Copies the starting and ending values into temp variables so we can keep track of progress
# [ToDo] it might be unnecessary?
currentStartURL = startURL
@@ -260,7 +45,7 @@ def main():
print("%s :: Process: %s - Starting: %s - Ending: %s" % (datetime.datetime.now(), i, currentStartURL, currentEndURL))
if(currentEndURL < endURL):
currentEndURL = endURL
processes.append(Process(target=imdbscrapper,args=(currentStartURL, currentEndURL)))
processes.append(Process(target=scrapper.imdbscrapper,args=(currentStartURL, currentEndURL)))
currentStartURL -= stepUpProcess
currentEndURL -= stepUpProcess
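# Worked example with the docker-compose.yml values above (START_URL=2000000, STEPUPCYCLE=50,
# PROCESSES=5): stepUpProcess = 50 / 5 = 10. Assuming the elided lines start currentEndURL at
# currentStartURL - stepUpProcess, the first cycle spawns processes covering
# (2000000, 1999990), (1999990, 1999980), ..., (1999960, 1999950): 10 IDs per process,
# 50 IDs per cycle, with no overlap because range(start, end, -1) excludes end.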