mirror of https://github.com/zebrajr/imdbscrapper.git

Added: Remove Duplicates
@@ -5,10 +5,11 @@ services:
     image: zebrajr/imdbscrapper:latest
     volumes:
       #- '${PWD}/src:/opt/imdbscrapper:ro'
-      - './src/scrapper/:/opt/imdbscrapper'
+      - './src/scrapper/:/opt/imdbscrapper:ro'
     user: 1000:1000
     environment:
-      - START_URL=910000
+      - removeDuplicates=1
+      - START_URL=2000000
       - END_URL=0
       - STEPUPCYCLE=50
       - PROCESSES=5
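The new removeDuplicates flag and the changed START_URL are picked up inside the container via os.getenv (see main() further down). A minimal sketch of the mapping, using the same names as the compose file:

    import os

    remDuplicates = int(os.getenv('removeDuplicates', 0))  # 1 triggers the boot-time duplicate cleanup
    startURL = int(os.getenv('START_URL', 10000000))       # highest title id to scan
    endURL = int(os.getenv('END_URL', 0))                  # the scan runs downwards towards this id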
@@ -3,7 +3,7 @@
 -- https://www.phpmyadmin.net/
 --
 -- Host: imdbdb
--- Generation Time: Jul 29, 2021 at 06:24 PM
+-- Generation Time: Aug 05, 2021 at 11:13 PM
 -- Server version: 10.6.3-MariaDB-1:10.6.3+maria~focal
 -- PHP Version: 7.4.1
@@ -52,6 +52,50 @@ CREATE DEFINER=`root`@`%` PROCEDURE `checkDuplicateSerie` (IN `idCheck` BIGINT(20))
     WHERE series.idSerie = idCheck;
 END$$
 
+CREATE DEFINER=`root`@`%` PROCEDURE `getDuplicateMovies` () BEGIN
+    SELECT movies.idMovie, movies.name
+    FROM movies
+    GROUP BY movies.name
+    HAVING COUNT(movies.name) > 1;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getDuplicateSeries` () BEGIN
+    SELECT series.idSerie, series.name
+    FROM series
+    GROUP BY series.name
+    HAVING COUNT(series.name) > 1;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getMovieByName` (IN `movieName` VARCHAR(255)) BEGIN
+    SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
+    FROM movies
+    WHERE movies.name = movieName;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getMovies` (IN `valueRating` DOUBLE, IN `valueRatingCount` BIGINT(20), IN `valueReleaseDate` DATE) BEGIN
+    SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
+    FROM movies
+    WHERE movies.rating >= valueRating
+    AND movies.ratingCount >= valueRatingCount
+    AND movies.releaseDate >= valueReleaseDate
+    ORDER BY movies.rating DESC, movies.ratingCount DESC;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getSerieByName` (IN `serieName` VARCHAR(255)) BEGIN
+    SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
+    FROM series
+    WHERE series.name = serieName;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getSeries` (IN `valueRating` DOUBLE, IN `valueRatingCount` BIGINT(20), IN `valueReleaseDate` DATE) BEGIN
+    SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
+    FROM series
+    WHERE series.rating >= valueRating
+    AND series.ratingCount >= valueRatingCount
+    AND series.releaseDate >= valueReleaseDate
+    ORDER BY series.rating DESC, series.ratingCount DESC;
+END$$
+
 CREATE DEFINER=`root`@`%` PROCEDURE `insertIgnore` (IN `inIDIgnore` BIGINT(20)) BEGIN
     INSERT INTO ignoreList
         (`idIgnore`)
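From Python these procedures are driven with cursor.callproc, as rsc/functions.py does below. A minimal sketch, with the connection values from createDBConnection further down. Note that getDuplicateMovies selects movies.idMovie while grouping only by movies.name; under MariaDB's default, permissive GROUP BY semantics this returns one arbitrary id per duplicated name, and that arbitrary id is exactly the row removeDuplicateMovie later deletes.

    import mysql.connector as mariadb

    conn = mariadb.connect(host='imdbdb', user='root', password='secret', database='imdbscrapper')
    cursor = conn.cursor(buffered=True)
    cursor.callproc('getDuplicateMovies')
    for resultSet in cursor.stored_results():
        for idMovie, name in resultSet.fetchall():
            print(idMovie, name)   # one arbitrary id per duplicated name
    conn.close()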
@@ -88,22 +132,58 @@ CREATE DEFINER=`root`@`%` PROCEDURE `insertSerieGenre` (IN `idSerie` BIGINT(20),
     VALUES(idSerie, idGenre);
 END$$
 
-CREATE DEFINER=`root`@`%` PROCEDURE `retrieveMovieByName` (`movieName` VARCHAR(255)) BEGIN
-    SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
-    FROM movies
-    WHERE movies.name = movieName;
+CREATE DEFINER=`root`@`%` PROCEDURE `removeDuplicateMovie` (IN `inMovie` BIGINT(20), IN `inName` VARCHAR(255)) BEGIN
+    DELETE FROM movies
+    WHERE movies.idMovie = inMovie;
+    DELETE FROM moviesGenre
+    WHERE moviesGenre.idMovie = inMovie;
+    INSERT INTO duplicateMovies
+        (duplicateMovies.idMovie, duplicateMovies.nameMovie)
+    VALUES (inMovie, inName);
+    INSERT INTO ignoreList
+        (ignoreList.idIgnore)
+    VALUES (inMovie);
 END$$
 
-CREATE DEFINER=`root`@`%` PROCEDURE `retrieveSerieByName` (`serieName` VARCHAR(255)) BEGIN
-    SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
-    FROM series
-    WHERE series.name = serieName;
+CREATE DEFINER=`root`@`%` PROCEDURE `removeDuplicateSerie` (IN `inSerie` BIGINT(20), IN `inName` VARCHAR(255)) BEGIN
+    DELETE FROM series
+    WHERE series.idSerie = inSerie;
+    DELETE FROM seriesGenre
+    WHERE seriesGenre.idSerie = inSerie;
+    INSERT INTO duplicateSeries
+        (duplicateSeries.idSerie, duplicateSeries.nameSerie)
+    VALUES (inSerie, inName);
+    INSERT INTO ignoreList
+        (ignoreList.idIgnore)
+    VALUES (inSerie);
 END$$
 
 DELIMITER ;
 
 -- --------------------------------------------------------
 
+--
+-- Table structure for table `duplicateMovies`
+--
+
+CREATE TABLE `duplicateMovies` (
+  `idMovie` bigint(20) NOT NULL,
+  `nameMovie` varchar(255) NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `duplicateSeries`
+--
+
+CREATE TABLE `duplicateSeries` (
+  `idSerie` bigint(20) NOT NULL,
+  `nameSerie` varchar(255) NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
 --
 -- Table structure for table `ignoreList`
 --
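removeDuplicateMovie both archives and blacklists the deleted id: the (id, name) pair lands in duplicateMovies for reference, and the id goes into ignoreList so checkForDuplicate (in rsc/functions.py below) skips it on future scans. A small verification helper, as a sketch (the function name is illustrative, not part of the repo):

    def wasRemoved(conn, idMovie):
        # True once removeDuplicateMovie has archived and blacklisted the id
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM duplicateMovies WHERE idMovie = %s", (idMovie,))
        (archived,) = cursor.fetchone()
        cursor.execute("SELECT COUNT(*) FROM ignoreList WHERE idIgnore = %s", (idMovie,))
        (ignored,) = cursor.fetchone()
        return archived > 0 and ignored > 0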
@@ -182,6 +262,18 @@ CREATE TABLE `seriesGenre` (
 -- Indexes for dumped tables
 --
 
+--
+-- Indexes for table `duplicateMovies`
+--
+ALTER TABLE `duplicateMovies`
+  ADD PRIMARY KEY (`idMovie`);
+
+--
+-- Indexes for table `duplicateSeries`
+--
+ALTER TABLE `duplicateSeries`
+  ADD PRIMARY KEY (`idSerie`);
+
 --
 -- Indexes for table `ignoreList`
 --
src/scrapper/rsc/functions.py (new file, 265 lines added)
@@ -0,0 +1,265 @@
import mysql.connector as mariadb
from time import sleep
# Note: imdbscrapper() below also needs these three; they are missing from the
# committed file and are added here so the module is runnable as listed.
import requests
import json
from bs4 import BeautifulSoup


# Creates and returns a mariadb connection object
def createDBConnection():
    mydb = mariadb.connect(
        host = 'imdbdb',
        user = 'root',
        password = 'secret',
        database = 'imdbscrapper'
    )
    return mydb
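The connection values are hard-coded; the [TODO] in saveToDatabase below already flags moving them into docker-compose.yml. One way that could look, as a sketch (the DB_* variable names are an assumption, nothing in the compose file defines them yet):

    import os

    def createDBConnectionFromEnv():
        # Hypothetical variant of createDBConnection that reads its settings
        # from the environment; the DB_* names are illustrative, not in the repo.
        return mariadb.connect(
            host = os.getenv('DB_HOST', 'imdbdb'),
            user = os.getenv('DB_USER', 'root'),
            password = os.getenv('DB_PASSWORD', 'secret'),
            database = os.getenv('DB_NAME', 'imdbscrapper')
        )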
'''
Function to save data from lists (eg: movieTable) to file (eg: moviesFile)
'''
def saveToFile(dataTable, dataPath):
    # Assume we couldn't open / write to the file
    successTest = False
    while successTest != True:
        # Try to open the file and append to it
        # [ToDo:] change try position to fix duplicate entries
        try:
            f = open(dataPath, "a")
            for row in dataTable:
                # Initialize an empty output string
                outputStr = ""
                # For each value in the list, add it as a string to the output, separating values with ";"
                for index, value in enumerate(row):
                    outputStr += str(value) + ";"
                # Add the newline (\n) to the string, write to the file, print to the screen
                outputStr += "\n"
                f.write(outputStr)
                print(row)
            f.close()
            # On success, move on
            successTest = True
        except Exception as e:
            print("Retrying updating - %s - %s" % (dataPath, e))
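Each row comes out as semicolon-joined fields with a trailing separator:

    # Illustrative row; saveToFile would write it as one ';'-terminated line:
    row = ['000910000', 'Example Title', 'Plot summary',
           'https://www.imdb.com/title/tt000910000/', 'Drama', 7.1, 1234, '2001-01-01']
    print(';'.join(str(value) for value in row) + ';')
    # -> 000910000;Example Title;Plot summary;https://www.imdb.com/title/tt000910000/;Drama;7.1;1234;2001-01-01;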
'''
Function to check for duplicate entries in the database.
Returns False if the id is already known; otherwise it falls off the end and
implicitly returns None, so callers must test `is False` rather than truthiness.
'''
def checkForDuplicate(idCheck):
    tablesToCheck = ['checkDuplicateMovie', 'checkDuplicateSerie', 'checkDuplicateIgnore', 'checkDuplicateRecheck']
    for table in tablesToCheck:
        mydb = createDBConnection()
        cursor = mydb.cursor(buffered=True)
        cursor.callproc(table, [idCheck,])
        for results in cursor.stored_results():
            result = results.fetchall()
        commitDBConnection(mydb)
        if len(result) > 0:
            return False
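Because the no-duplicate path never returns True, the caller has to distinguish False from None:

    result = checkForDuplicate('000910000')
    if result is False:
        pass   # id already known: skip it
    # result is None (not True) for a new id, so a bare `if result:` would never fire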
'''
Remove duplicates from the main tables
'''
def removeDuplicates():
    # Remove duplicate movies
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)
    cursor.callproc('getDuplicateMovies')
    for results in cursor.stored_results():
        result = results.fetchall()
    commitDBConnection(mydb)
    for row in result:
        mydb = createDBConnection()
        cursor = mydb.cursor(buffered=True)
        print("Removing duplicate:: %s" %(row[0]))
        cursor.callproc('removeDuplicateMovie', [row[0], row[1],])
        commitDBConnection(mydb)

    # Remove duplicate series
    # [TODO] D.R.Y
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)
    cursor.callproc('getDuplicateSeries')
    for results in cursor.stored_results():
        result = results.fetchall()
    commitDBConnection(mydb)
    for row in result:
        mydb = createDBConnection()
        cursor = mydb.cursor(buffered=True)
        print("Removing duplicate:: %s" %(row[0]))
        cursor.callproc('removeDuplicateSerie', [row[0], row[1],])
        commitDBConnection(mydb)
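The [TODO] D.R.Y note invites folding the two passes into one parameterised helper; a sketch (the helper name is illustrative, not part of the repo):

    def removeDuplicatesFor(listProc, removeProc):
        # Generic pass: fetch the duplicate (id, name) pairs, then remove each one
        mydb = createDBConnection()
        cursor = mydb.cursor(buffered=True)
        cursor.callproc(listProc)
        rows = [row for results in cursor.stored_results() for row in results.fetchall()]
        commitDBConnection(mydb)
        for idValue, name in rows:
            mydb = createDBConnection()
            cursor = mydb.cursor(buffered=True)
            print("Removing duplicate:: %s" % (idValue))
            cursor.callproc(removeProc, [idValue, name,])
            commitDBConnection(mydb)

    # removeDuplicatesFor('getDuplicateMovies', 'removeDuplicateMovie')
    # removeDuplicatesFor('getDuplicateSeries', 'removeDuplicateSerie')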
def commitDBConnection(database):
    database.commit()
    database.close()

def saveIgnoreToDatabase(idIgnore):
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)
    cursor.callproc('insertIgnore', [idIgnore,])
    commitDBConnection(mydb)

def saveRecheckToDatabase(idRecheck):
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)
    cursor.callproc('insertRecheck', [idRecheck,])
    commitDBConnection(mydb)


'''
Function to save to the database
'''
def saveToDatabase(dataTable, inTable):
    # [TODO] Change to dynamic values from docker-compose.yml
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)

    # Defines which procedures to call
    if (inTable == 'movies'):
        mainTable = 'insertMovie'
        genreTable = 'insertMovieGenre'
    if (inTable == 'series'):
        mainTable = 'insertSerie'
        genreTable = 'insertSerieGenre'

    for row in dataTable:
        print("Found %s" %(row[0]))
        cursor.callproc(mainTable, [row[0],row[1],row[2],row[3],row[5],row[6],row[7],])
        try:
            if len(row[4]) > 1:
                for genre in row[4]:
                    cursor.callproc(genreTable, [row[0],genre,])
                continue
        except Exception as e:
            cursor.callproc(genreTable, [row[0],str(row[4]),])
    commitDBConnection(mydb)
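saveToDatabase expects rows laid out as imdbscrapper builds them below: [id, name, description, url, genre, rating, ratingCount, releaseDate]. Index 4 (genre) is skipped in the main insert and handled separately: a multi-entry genre list gets one insertMovieGenre/insertSerieGenre call per entry, while a bare value such as the 0 fallback raises on len() and is inserted as a string by the except branch. Two quirks worth knowing: a one-element genre list fails the len(row[4]) > 1 test and is never inserted, and a plain string genre would be iterated character by character (IMDb's JSON-LD usually delivers multiple genres as a list, which is why this mostly works). Illustrative call, values made up:

    movieRow = ['000910000', 'Example Title', 'Plot summary',
                'https://www.imdb.com/title/tt000910000/',
                ['Drama', 'Comedy'],   # row[4]: one genre insert per entry
                7.1, 1234, '2001-01-01']
    saveToDatabase([movieRow], 'movies')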
'''
Main function for the scrapper.
It prepares each URL, requests it, parses the response into a list, appends
that list to a list of lists, and finally saves everything with saveToDatabase()
(the earlier saveToFile() calls are kept commented out below).
Requires a starting and an ending id (as int), scanned in decreasing order (eg: 1000, 999, 998, etc.)
'''
def imdbscrapper(startURL, endURL):
    # Configuration values for the scrapper
    baseURL = "https://www.imdb.com/title/tt"    # Base URL for each title
    debugLevel = 40                              # 20 will display Info messages, 40 errors
    #logFile = "/opt/storage/info.log"           # Log output
    counterFile = "/opt/storage/counter.txt"     # Which ID was last scanned
    reCheckFile = "/opt/storage/recheck.txt"     # Which IDs to recheck
    moviesFile = "/opt/storage/movies.csv"       # Where to store movie info
    seriesFile = "/opt/storage/series.csv"       # Where to store shows/series info

    # Initialize lists of lists
    movieTable = []
    serieTable = []
    errorTable = []
    reCheckTable = []
    # Go in descending order from startURL to endURL
    for i in range(startURL, endURL, -1):
        #logging.basicConfig(filename=logFile, level=logging.INFO)
        titleFixed = str(i).zfill(9)        # Adds leading zeros, so that the id always has 9 digits
        url = baseURL + titleFixed + '/'    # Joins every part of the URL into one string
        dataRow = []       # Initializes the dataRow list
        errorRow = []      # Initializes the errorRow list
        reCheckRow = []    # Initializes the reCheckRow list

        # Assume non-duplicate
        testDuplicate = True
        # Test for duplicate
        testDuplicate = checkForDuplicate(titleFixed)

        # If a duplicate is found, skip this number
        if testDuplicate is False:
            continue
        # While loop made to wait if a 503 code is received (too many requests)
        testNext = False
        while testNext == False:
            try:
                testNext = True
                dataRow.append(titleFixed)
                # Requests, parses and loads into JSON the HTML response
                html = requests.get(url).text
                soup = BeautifulSoup(html, 'html.parser')
                # If Error 503 is found
                if len(soup.findAll(text='Error 503')) > 0:
                    testNext = False
                    print("Did we just get 503ed? Waiting 60...")
                    sleep(60)

                data = json.loads(soup.find('script', type='application/ld+json').string)

                # If the response is a TVEpisode, just skip the number altogether
                if(data['@type'] == 'TVEpisode'):
                    saveIgnoreToDatabase(titleFixed)
                    continue

                # Gets the desired values from the JSON response
                dataRow.append(data['name'])
                try:
                    dataRow.append(str(data['description']).replace(';', ''))
                except Exception as e:
                    dataRow.append("Description unavailable")
                dataRow.append(url)
                try:
                    dataRow.append(data['genre'])
                except Exception as e:
                    dataRow.append(0)
                try:
                    dataRow.append(data['aggregateRating']['ratingValue'])
                except Exception as e:
                    dataRow.append(0)
                try:
                    dataRow.append(data['aggregateRating']['ratingCount'])
                except Exception as e:
                    dataRow.append(0)
                try:
                    dataRow.append(data['datePublished'])
                except Exception as e:
                    dataRow.append('1000-01-01')

                # Checks whether it's a movie or a series/show, and appends the row to the matching list of lists
                if(data['@type'] == 'Movie'):
                    movieTable.append(dataRow)
                if(data['@type'] == 'TVSeries'):
                    serieTable.append(dataRow)
            except Exception as e:
                # Prepares the error string, then appends the error list to the list of lists of errors
                #errorMessage = titleFixed + " - " + str(e)
                #errorRow.append(errorMessage)
                #errorTable.append(errorRow)
                # If the error is "page not available", append to the reCheck list (performance improvement on rechecks)
                if("NoneType" in str(e)):
                    # [TODO] Page 404 not implemented
                    # If Error 404 is found
                    saveRecheckToDatabase(titleFixed)
                #testNext = False
                #print ("Uncaught Error? Waiting 10")
                #sleep(10)
                #recheckString = titleFixed + "\n"
                #reCheckRow.append(recheckString)
                #reCheckTable.append(reCheckRow)

    # Writes the list of lists to each correct destination
    #saveToFile(movieTable, moviesFile)
    #saveToFile(serieTable, seriesFile)
    #saveToFile(errorTable, logFile)
    #saveToFile(reCheckTable, reCheckFile)
    saveToDatabase(movieTable, 'movies')
    saveToDatabase(serieTable, 'series')
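The ids are scanned in decreasing order and range() excludes the end value:

    # Scan tt000910000 down to tt000909981 (20 titles), then persist the results:
    imdbscrapper(910000, 909980)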
def main():
    pass


if __name__ == "__main__":
    main()
src/scrapper/rsc/helper.py (new file, 17 lines added)
@@ -0,0 +1,17 @@
import os


def cls():
    os.system('cls' if os.name=='nt' else 'clear')


'''
Make the script do nothing when executed directly
'''
def main():
    pass


if __name__ == "__main__":
    main()
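Both new files sit in an rsc/ package next to the entry script whose diff follows (that script's filename is not shown in this view). Under Python 3, rsc/ resolves as a namespace package even without an __init__.py, provided the script runs from src/scrapper/, which matches the volume mount in docker-compose.yml:

    src/scrapper/
        <entry script>        # defines main(); diff below
        rsc/
            functions.py      # scrapper logic (listed above)
            helper.py         # cls() helper (listed above)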
@@ -9,244 +9,29 @@ import logging
 import mysql.connector as mariadb
 from multiprocessing import Process
 from bs4 import BeautifulSoup
 
 [removed block: the definitions of createDBConnection, cls, saveToFile,
 checkForDuplicate, commitDBConnection, saveIgnoreToDatabase,
 saveRecheckToDatabase, saveToDatabase and imdbscrapper were all deleted
 from this file; the moved code is identical to the rsc/functions.py and
 rsc/helper.py listings above]
 
+import rsc.functions as scrapper
+import rsc.helper as helper
 def main():
-    cls()
+    helper.cls()
 
     #imdbscrapper(903747,903733)
 
-    nrProcesses = int(os.getenv('PROCESSES', 5))           # Number of Processes to start in parallel
-    startURL = int(os.getenv('START_URL', 10000000))       # Starting Number
-    endURL = int(os.getenv('END_URL', 0))                  # Ending Number
-    stepUpCycle = int(os.getenv('STEPUPCYCLE', 100))       # How many numbers will be checked in each cycle
-    stepUpProcess = int(stepUpCycle / nrProcesses)         # Divides the numbers to be checked in each cycle by the total number of processes
+    nrProcesses = int(os.getenv('PROCESSES', 5))           # Number of Processes to start in parallel
+    startURL = int(os.getenv('START_URL', 10000000))       # Starting Number
+    endURL = int(os.getenv('END_URL', 0))                  # Ending Number
+    stepUpCycle = int(os.getenv('STEPUPCYCLE', 100))       # How many numbers will be checked in each cycle
+    stepUpProcess = int(stepUpCycle / nrProcesses)         # Divides the numbers to be checked in each cycle by the total number of processes
+    remDuplicates = int(os.getenv('removeDuplicates', 0))  # If 1: call removeDuplicates()
     # Eg: 5 Processes running each cycle. 100 Numbers each cycle. 20 Numbers per process
 
+    # Remove Duplicates on boot?
+    if remDuplicates == 1:
+        scrapper.removeDuplicates()
+
     # Parses the Starting and ending values to temp variables as to keep track
     # [ToDo] it might be unnecessary?
     currentStartURL = startURL
@@ -260,7 +45,7 @@ def main():
         print("%s :: Process: %s - Starting: %s - Ending: %s" % (datetime.datetime.now(), i, currentStartURL, currentEndURL))
         if(currentEndURL < endURL):
             currentEndURL = endURL
-        processes.append(Process(target=imdbscrapper,args=(currentStartURL, currentEndURL)))
+        processes.append(Process(target=scrapper.imdbscrapper,args=(currentStartURL, currentEndURL)))
         currentStartURL -= stepUpProcess
        currentEndURL -= stepUpProcess
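With the docker-compose values above, the partitioning arithmetic works out as follows (a worked sketch of the stepUpProcess split):

    startURL, stepUpCycle, nrProcesses = 2000000, 50, 5
    stepUpProcess = stepUpCycle // nrProcesses   # 50 / 5 = 10 ids per process
    for i in range(nrProcesses):
        print(i, startURL - i * stepUpProcess, startURL - (i + 1) * stepUpProcess)
    # 0 2000000 1999990
    # 1 1999990 1999980
    # 2 1999980 1999970
    # 3 1999970 1999960
    # 4 1999960 1999950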