mirror of https://github.com/zebrajr/imdbscrapper.git

Added: Remove Duplicates
@@ -5,10 +5,11 @@ services:
     image: zebrajr/imdbscrapper:latest
     volumes:
       #- '${PWD}/src:/opt/imdbscrapper:ro'
-      - './src/scrapper/:/opt/imdbscrapper'
+      - './src/scrapper/:/opt/imdbscrapper:ro'
     user: 1000:1000
     environment:
-      - START_URL=910000
+      - removeDuplicates=1
+      - START_URL=2000000
       - END_URL=0
       - STEPUPCYCLE=50
       - PROCESSES=5
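The new removeDuplicates flag and the changed START_URL are picked up inside the container via os.getenv (see main() further down). A minimal sketch of the mapping, using the same names as the compose file:

    import os

    remDuplicates = int(os.getenv('removeDuplicates', 0))  # 1 triggers the boot-time duplicate cleanup
    startURL = int(os.getenv('START_URL', 10000000))       # highest title id to scan
    endURL = int(os.getenv('END_URL', 0))                  # the scan runs downwards towards this id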
@@ -3,7 +3,7 @@
 -- https://www.phpmyadmin.net/
 --
 -- Host: imdbdb
--- Generation Time: Jul 29, 2021 at 06:24 PM
+-- Generation Time: Aug 05, 2021 at 11:13 PM
 -- Server version: 10.6.3-MariaDB-1:10.6.3+maria~focal
 -- PHP Version: 7.4.1
@@ -52,6 +52,50 @@ CREATE DEFINER=`root`@`%` PROCEDURE `checkDuplicateSerie` (IN `idCheck` BIGINT(20))
     WHERE series.idSerie = idCheck;
 END$$
 
+CREATE DEFINER=`root`@`%` PROCEDURE `getDuplicateMovies` () BEGIN
+    SELECT movies.idMovie, movies.name
+    FROM movies
+    GROUP BY movies.name
+    HAVING COUNT(movies.name) > 1;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getDuplicateSeries` () BEGIN
+    SELECT series.idSerie, series.name
+    FROM series
+    GROUP BY series.name
+    HAVING COUNT(series.name) > 1;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getMovieByName` (IN `movieName` VARCHAR(255)) BEGIN
+    SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
+    FROM movies
+    WHERE movies.name = movieName;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getMovies` (IN `valueRating` DOUBLE, IN `valueRatingCount` BIGINT(20), IN `valueReleaseDate` DATE) BEGIN
+    SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
+    FROM movies
+    WHERE movies.rating >= valueRating
+    AND movies.ratingCount >= valueRatingCount
+    AND movies.releaseDate >= valueReleaseDate
+    ORDER BY movies.rating DESC, movies.ratingCount DESC;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getSerieByName` (IN `serieName` VARCHAR(255)) BEGIN
+    SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
+    FROM series
+    WHERE series.name = serieName;
+END$$
+
+CREATE DEFINER=`root`@`%` PROCEDURE `getSeries` (IN `valueRating` DOUBLE, IN `valueRatingCount` BIGINT(20), IN `valueReleaseDate` DATE) BEGIN
+    SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
+    FROM series
+    WHERE series.rating >= valueRating
+    AND series.ratingCount >= valueRatingCount
+    AND series.releaseDate >= valueReleaseDate
+    ORDER BY series.rating DESC, series.ratingCount DESC;
+END$$
+
 CREATE DEFINER=`root`@`%` PROCEDURE `insertIgnore` (IN `inIDIgnore` BIGINT(20)) BEGIN
     INSERT INTO ignoreList
         (`idIgnore`)
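From Python these procedures are driven with cursor.callproc, as rsc/functions.py does below. A minimal sketch, with the connection values from createDBConnection further down. Note that getDuplicateMovies selects movies.idMovie while grouping only by movies.name; under MariaDB's default, permissive GROUP BY semantics this returns one arbitrary id per duplicated name, and that arbitrary id is exactly the row removeDuplicateMovie later deletes.

    import mysql.connector as mariadb

    conn = mariadb.connect(host='imdbdb', user='root', password='secret', database='imdbscrapper')
    cursor = conn.cursor(buffered=True)
    cursor.callproc('getDuplicateMovies')
    for resultSet in cursor.stored_results():
        for idMovie, name in resultSet.fetchall():
            print(idMovie, name)   # one arbitrary id per duplicated name
    conn.close()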
@@ -88,22 +132,58 @@ CREATE DEFINER=`root`@`%` PROCEDURE `insertSerieGenre` (IN `idSerie` BIGINT(20),
     VALUES(idSerie, idGenre);
 END$$
 
-CREATE DEFINER=`root`@`%` PROCEDURE `retrieveMovieByName` (`movieName` VARCHAR(255)) BEGIN
-    SELECT movies.idMovie, movies.name, movies.description, movies.imdbURL, movies.rating, movies.ratingCount, movies.releaseDate
-    FROM movies
-    WHERE movies.name = movieName;
+CREATE DEFINER=`root`@`%` PROCEDURE `removeDuplicateMovie` (IN `inMovie` BIGINT(20), IN `inName` VARCHAR(255)) BEGIN
+    DELETE FROM movies
+    WHERE movies.idMovie = inMovie;
+    DELETE FROM moviesGenre
+    WHERE moviesGenre.idMovie = inMovie;
+    INSERT INTO duplicateMovies
+        (duplicateMovies.idMovie, duplicateMovies.nameMovie)
+    VALUES (inMovie, inName);
+    INSERT INTO ignoreList
+        (ignoreList.idIgnore)
+    VALUES (inMovie);
 END$$
 
-CREATE DEFINER=`root`@`%` PROCEDURE `retrieveSerieByName` (`serieName` VARCHAR(255)) BEGIN
-    SELECT series.idSerie, series.name, series.description, series.imdbURL, series.rating, series.ratingCount, series.releaseDate
-    FROM series
-    WHERE series.name = serieName;
+CREATE DEFINER=`root`@`%` PROCEDURE `removeDuplicateSerie` (IN `inSerie` BIGINT(20), IN `inName` VARCHAR(255)) BEGIN
+    DELETE FROM series
+    WHERE series.idSerie = inSerie;
+    DELETE FROM seriesGenre
+    WHERE seriesGenre.idSerie = inSerie;
+    INSERT INTO duplicateSeries
+        (duplicateSeries.idSerie, duplicateSeries.nameSerie)
+    VALUES (inSerie, inName);
+    INSERT INTO ignoreList
+        (ignoreList.idIgnore)
+    VALUES (inSerie);
 END$$
 
 DELIMITER ;
 
 -- --------------------------------------------------------
 
+--
+-- Table structure for table `duplicateMovies`
+--
+
+CREATE TABLE `duplicateMovies` (
+  `idMovie` bigint(20) NOT NULL,
+  `nameMovie` varchar(255) NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `duplicateSeries`
+--
+
+CREATE TABLE `duplicateSeries` (
+  `idSerie` bigint(20) NOT NULL,
+  `nameSerie` varchar(255) NOT NULL
+) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
+
+-- --------------------------------------------------------
+
 --
 -- Table structure for table `ignoreList`
 --
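removeDuplicateMovie both archives and blacklists the deleted id: the (id, name) pair lands in duplicateMovies for reference, and the id goes into ignoreList so checkForDuplicate (in rsc/functions.py below) skips it on future scans. A small verification helper, as a sketch (the function name is illustrative, not part of the repo):

    def wasRemoved(conn, idMovie):
        # True once removeDuplicateMovie has archived and blacklisted the id
        cursor = conn.cursor()
        cursor.execute("SELECT COUNT(*) FROM duplicateMovies WHERE idMovie = %s", (idMovie,))
        (archived,) = cursor.fetchone()
        cursor.execute("SELECT COUNT(*) FROM ignoreList WHERE idIgnore = %s", (idMovie,))
        (ignored,) = cursor.fetchone()
        return archived > 0 and ignored > 0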
@@ -182,6 +262,18 @@ CREATE TABLE `seriesGenre` (
 -- Indexes for dumped tables
 --
 
+--
+-- Indexes for table `duplicateMovies`
+--
+ALTER TABLE `duplicateMovies`
+  ADD PRIMARY KEY (`idMovie`);
+
+--
+-- Indexes for table `duplicateSeries`
+--
+ALTER TABLE `duplicateSeries`
+  ADD PRIMARY KEY (`idSerie`);
+
 --
 -- Indexes for table `ignoreList`
 --
src/scrapper/rsc/functions.py (new file, 265 lines added)
@@ -0,0 +1,265 @@
import mysql.connector as mariadb
from time import sleep
# Note: imdbscrapper() below also needs these three; they are missing from the
# committed file and are added here so the module is runnable as listed.
import requests
import json
from bs4 import BeautifulSoup


# Creates and returns a mariadb connection object
def createDBConnection():
    mydb = mariadb.connect(
        host = 'imdbdb',
        user = 'root',
        password = 'secret',
        database = 'imdbscrapper'
    )
    return mydb
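The connection values are hard-coded; the [TODO] in saveToDatabase below already flags moving them into docker-compose.yml. One way that could look, as a sketch (the DB_* variable names are an assumption, nothing in the compose file defines them yet):

    import os

    def createDBConnectionFromEnv():
        # Hypothetical variant of createDBConnection that reads its settings
        # from the environment; the DB_* names are illustrative, not in the repo.
        return mariadb.connect(
            host = os.getenv('DB_HOST', 'imdbdb'),
            user = os.getenv('DB_USER', 'root'),
            password = os.getenv('DB_PASSWORD', 'secret'),
            database = os.getenv('DB_NAME', 'imdbscrapper')
        )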
'''
Function to save data from lists (eg: movieTable) to file (eg: moviesFile)
'''
def saveToFile(dataTable, dataPath):
    # Assume we couldn't open / write to the file
    successTest = False
    while successTest != True:
        # Try to open the file and append to it
        # [ToDo:] change try position to fix duplicate entries
        try:
            f = open(dataPath, "a")
            for row in dataTable:
                # Initialize an empty output string
                outputStr = ""
                # For each value in the list, add it as a string to the output, separating values with ";"
                for index, value in enumerate(row):
                    outputStr += str(value) + ";"
                # Add the newline (\n) to the string, write to the file, print to the screen
                outputStr += "\n"
                f.write(outputStr)
                print(row)
            f.close()
            # On success, move on
            successTest = True
        except Exception as e:
            print("Retrying updating - %s - %s" % (dataPath, e))
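Each row comes out as semicolon-joined fields with a trailing separator:

    # Illustrative row; saveToFile would write it as one ';'-terminated line:
    row = ['000910000', 'Example Title', 'Plot summary',
           'https://www.imdb.com/title/tt000910000/', 'Drama', 7.1, 1234, '2001-01-01']
    print(';'.join(str(value) for value in row) + ';')
    # -> 000910000;Example Title;Plot summary;https://www.imdb.com/title/tt000910000/;Drama;7.1;1234;2001-01-01;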
'''
Function to check for duplicate entries in the database.
Returns False if the id is already known; otherwise it falls off the end and
implicitly returns None, so callers must test `is False` rather than truthiness.
'''
def checkForDuplicate(idCheck):
    tablesToCheck = ['checkDuplicateMovie', 'checkDuplicateSerie', 'checkDuplicateIgnore', 'checkDuplicateRecheck']
    for table in tablesToCheck:
        mydb = createDBConnection()
        cursor = mydb.cursor(buffered=True)
        cursor.callproc(table, [idCheck,])
        for results in cursor.stored_results():
            result = results.fetchall()
        commitDBConnection(mydb)
        if len(result) > 0:
            return False
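Because the no-duplicate path never returns True, the caller has to distinguish False from None:

    result = checkForDuplicate('000910000')
    if result is False:
        pass   # id already known: skip it
    # result is None (not True) for a new id, so a bare `if result:` would never fire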
'''
Remove duplicates from the main tables
'''
def removeDuplicates():
    # Remove duplicate movies
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)
    cursor.callproc('getDuplicateMovies')
    for results in cursor.stored_results():
        result = results.fetchall()
    commitDBConnection(mydb)
    for row in result:
        mydb = createDBConnection()
        cursor = mydb.cursor(buffered=True)
        print("Removing duplicate:: %s" %(row[0]))
        cursor.callproc('removeDuplicateMovie', [row[0], row[1],])
        commitDBConnection(mydb)

    # Remove duplicate series
    # [TODO] D.R.Y
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)
    cursor.callproc('getDuplicateSeries')
    for results in cursor.stored_results():
        result = results.fetchall()
    commitDBConnection(mydb)
    for row in result:
        mydb = createDBConnection()
        cursor = mydb.cursor(buffered=True)
        print("Removing duplicate:: %s" %(row[0]))
        cursor.callproc('removeDuplicateSerie', [row[0], row[1],])
        commitDBConnection(mydb)
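The [TODO] D.R.Y note invites folding the two passes into one parameterised helper; a sketch (the helper name is illustrative, not part of the repo):

    def removeDuplicatesFor(listProc, removeProc):
        # Generic pass: fetch the duplicate (id, name) pairs, then remove each one
        mydb = createDBConnection()
        cursor = mydb.cursor(buffered=True)
        cursor.callproc(listProc)
        rows = [row for results in cursor.stored_results() for row in results.fetchall()]
        commitDBConnection(mydb)
        for idValue, name in rows:
            mydb = createDBConnection()
            cursor = mydb.cursor(buffered=True)
            print("Removing duplicate:: %s" % (idValue))
            cursor.callproc(removeProc, [idValue, name,])
            commitDBConnection(mydb)

    # removeDuplicatesFor('getDuplicateMovies', 'removeDuplicateMovie')
    # removeDuplicatesFor('getDuplicateSeries', 'removeDuplicateSerie')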
def commitDBConnection(database):
    database.commit()
    database.close()

def saveIgnoreToDatabase(idIgnore):
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)
    cursor.callproc('insertIgnore', [idIgnore,])
    commitDBConnection(mydb)

def saveRecheckToDatabase(idRecheck):
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)
    cursor.callproc('insertRecheck', [idRecheck,])
    commitDBConnection(mydb)


'''
Function to save to the database
'''
def saveToDatabase(dataTable, inTable):
    # [TODO] Change to dynamic values from docker-compose.yml
    mydb = createDBConnection()
    cursor = mydb.cursor(buffered=True)

    # Defines which procedures to call
    if (inTable == 'movies'):
        mainTable = 'insertMovie'
        genreTable = 'insertMovieGenre'
    if (inTable == 'series'):
        mainTable = 'insertSerie'
        genreTable = 'insertSerieGenre'

    for row in dataTable:
        print("Found %s" %(row[0]))
        cursor.callproc(mainTable, [row[0],row[1],row[2],row[3],row[5],row[6],row[7],])
        try:
            if len(row[4]) > 1:
                for genre in row[4]:
                    cursor.callproc(genreTable, [row[0],genre,])
                continue
        except Exception as e:
            cursor.callproc(genreTable, [row[0],str(row[4]),])
    commitDBConnection(mydb)
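saveToDatabase expects rows laid out as imdbscrapper builds them below: [id, name, description, url, genre, rating, ratingCount, releaseDate]. Index 4 (genre) is skipped in the main insert and handled separately: a multi-entry genre list gets one insertMovieGenre/insertSerieGenre call per entry, while a bare value such as the 0 fallback raises on len() and is inserted as a string by the except branch. Two quirks worth knowing: a one-element genre list fails the len(row[4]) > 1 test and is never inserted, and a plain string genre would be iterated character by character (IMDb's JSON-LD usually delivers multiple genres as a list, which is why this mostly works). Illustrative call, values made up:

    movieRow = ['000910000', 'Example Title', 'Plot summary',
                'https://www.imdb.com/title/tt000910000/',
                ['Drama', 'Comedy'],   # row[4]: one genre insert per entry
                7.1, 1234, '2001-01-01']
    saveToDatabase([movieRow], 'movies')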
'''
Main function for the scrapper.
It prepares each URL, requests it, parses the response into a list, appends
that list to a list of lists, and finally saves everything with saveToDatabase()
(the earlier saveToFile() calls are kept commented out below).
Requires a starting and an ending id (as int), scanned in decreasing order (eg: 1000, 999, 998, etc.)
'''
def imdbscrapper(startURL, endURL):
    # Configuration values for the scrapper
    baseURL = "https://www.imdb.com/title/tt"    # Base URL for each title
    debugLevel = 40                              # 20 will display Info messages, 40 errors
    #logFile = "/opt/storage/info.log"           # Log output
    counterFile = "/opt/storage/counter.txt"     # Which ID was last scanned
    reCheckFile = "/opt/storage/recheck.txt"     # Which IDs to recheck
    moviesFile = "/opt/storage/movies.csv"       # Where to store movie info
    seriesFile = "/opt/storage/series.csv"       # Where to store shows/series info

    # Initialize lists of lists
    movieTable = []
    serieTable = []
    errorTable = []
    reCheckTable = []
    # Go in descending order from startURL to endURL
    for i in range(startURL, endURL, -1):
        #logging.basicConfig(filename=logFile, level=logging.INFO)
        titleFixed = str(i).zfill(9)        # Adds leading zeros, so that the id always has 9 digits
        url = baseURL + titleFixed + '/'    # Joins every part of the URL into one string
        dataRow = []       # Initializes the dataRow list
        errorRow = []      # Initializes the errorRow list
        reCheckRow = []    # Initializes the reCheckRow list

        # Assume non-duplicate
        testDuplicate = True
        # Test for duplicate
        testDuplicate = checkForDuplicate(titleFixed)

        # If a duplicate is found, skip this number
        if testDuplicate is False:
            continue
        # While loop made to wait if a 503 code is received (too many requests)
        testNext = False
        while testNext == False:
            try:
                testNext = True
                dataRow.append(titleFixed)
                # Requests, parses and loads into JSON the HTML response
                html = requests.get(url).text
                soup = BeautifulSoup(html, 'html.parser')
                # If Error 503 is found
                if len(soup.findAll(text='Error 503')) > 0:
                    testNext = False
                    print("Did we just get 503ed? Waiting 60...")
                    sleep(60)

                data = json.loads(soup.find('script', type='application/ld+json').string)

                # If the response is a TVEpisode, just skip the number altogether
                if(data['@type'] == 'TVEpisode'):
                    saveIgnoreToDatabase(titleFixed)
                    continue

                # Gets the desired values from the JSON response
                dataRow.append(data['name'])
                try:
                    dataRow.append(str(data['description']).replace(';', ''))
                except Exception as e:
                    dataRow.append("Description unavailable")
                dataRow.append(url)
                try:
                    dataRow.append(data['genre'])
                except Exception as e:
                    dataRow.append(0)
                try:
                    dataRow.append(data['aggregateRating']['ratingValue'])
                except Exception as e:
                    dataRow.append(0)
                try:
                    dataRow.append(data['aggregateRating']['ratingCount'])
                except Exception as e:
                    dataRow.append(0)
                try:
                    dataRow.append(data['datePublished'])
                except Exception as e:
                    dataRow.append('1000-01-01')

                # Checks whether it's a movie or a series/show, and appends the row to the matching list of lists
                if(data['@type'] == 'Movie'):
                    movieTable.append(dataRow)
                if(data['@type'] == 'TVSeries'):
                    serieTable.append(dataRow)
            except Exception as e:
                # Prepares the error string, then appends the error list to the list of lists of errors
                #errorMessage = titleFixed + " - " + str(e)
                #errorRow.append(errorMessage)
                #errorTable.append(errorRow)
                # If the error is "page not available", append to the reCheck list (performance improvement on rechecks)
                if("NoneType" in str(e)):
                    # [TODO] Page 404 not implemented
                    # If Error 404 is found
                    saveRecheckToDatabase(titleFixed)
                #testNext = False
                #print ("Uncaught Error? Waiting 10")
                #sleep(10)
                #recheckString = titleFixed + "\n"
                #reCheckRow.append(recheckString)
                #reCheckTable.append(reCheckRow)

    # Writes the list of lists to each correct destination
    #saveToFile(movieTable, moviesFile)
    #saveToFile(serieTable, seriesFile)
    #saveToFile(errorTable, logFile)
    #saveToFile(reCheckTable, reCheckFile)
    saveToDatabase(movieTable, 'movies')
    saveToDatabase(serieTable, 'series')
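The ids are scanned in decreasing order and range() excludes the end value:

    # Scan tt000910000 down to tt000909981 (20 titles), then persist the results:
    imdbscrapper(910000, 909980)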
def main():
    pass


if __name__ == "__main__":
    main()
src/scrapper/rsc/helper.py (new file, 17 lines added)
@@ -0,0 +1,17 @@
import os


def cls():
    os.system('cls' if os.name=='nt' else 'clear')


'''
Make the script do nothing when executed directly
'''
def main():
    pass


if __name__ == "__main__":
    main()
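Both new files sit in an rsc/ package next to the entry script whose diff follows (that script's filename is not shown in this view). Under Python 3, rsc/ resolves as a namespace package even without an __init__.py, provided the script runs from src/scrapper/, which matches the volume mount in docker-compose.yml:

    src/scrapper/
        <entry script>        # defines main(); diff below
        rsc/
            functions.py      # scrapper logic (listed above)
            helper.py         # cls() helper (listed above)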
@@ -9,244 +9,29 @@ import logging
 import mysql.connector as mariadb
 from multiprocessing import Process
 from bs4 import BeautifulSoup
 
 [removed block: the definitions of createDBConnection, cls, saveToFile,
 checkForDuplicate, commitDBConnection, saveIgnoreToDatabase,
 saveRecheckToDatabase, saveToDatabase and imdbscrapper were all deleted
 from this file; the moved code is identical to the rsc/functions.py and
 rsc/helper.py listings above]
 
+import rsc.functions as scrapper
+import rsc.helper as helper
 def main():
-    cls()
+    helper.cls()
 
     #imdbscrapper(903747,903733)
 
-    nrProcesses = int(os.getenv('PROCESSES', 5))           # Number of Processes to start in parallel
-    startURL = int(os.getenv('START_URL', 10000000))       # Starting Number
-    endURL = int(os.getenv('END_URL', 0))                  # Ending Number
-    stepUpCycle = int(os.getenv('STEPUPCYCLE', 100))       # How many numbers will be checked in each cycle
-    stepUpProcess = int(stepUpCycle / nrProcesses)         # Divides the numbers to be checked in each cycle by the total number of processes
+    nrProcesses = int(os.getenv('PROCESSES', 5))           # Number of Processes to start in parallel
+    startURL = int(os.getenv('START_URL', 10000000))       # Starting Number
+    endURL = int(os.getenv('END_URL', 0))                  # Ending Number
+    stepUpCycle = int(os.getenv('STEPUPCYCLE', 100))       # How many numbers will be checked in each cycle
+    stepUpProcess = int(stepUpCycle / nrProcesses)         # Divides the numbers to be checked in each cycle by the total number of processes
+    remDuplicates = int(os.getenv('removeDuplicates', 0))  # If 1: call removeDuplicates()
     # Eg: 5 Processes running each cycle. 100 Numbers each cycle. 20 Numbers per process
 
+    # Remove Duplicates on boot?
+    if remDuplicates == 1:
+        scrapper.removeDuplicates()
+
     # Parses the Starting and ending values to temp variables as to keep track
     # [ToDo] it might be unnecessary?
     currentStartURL = startURL
@@ -260,7 +45,7 @@ def main():
         print("%s :: Process: %s - Starting: %s - Ending: %s" % (datetime.datetime.now(), i, currentStartURL, currentEndURL))
         if(currentEndURL < endURL):
             currentEndURL = endURL
-        processes.append(Process(target=imdbscrapper,args=(currentStartURL, currentEndURL)))
+        processes.append(Process(target=scrapper.imdbscrapper,args=(currentStartURL, currentEndURL)))
         currentStartURL -= stepUpProcess
        currentEndURL -= stepUpProcess
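With the docker-compose values above, the partitioning arithmetic works out as follows (a worked sketch of the stepUpProcess split):

    startURL, stepUpCycle, nrProcesses = 2000000, 50, 5
    stepUpProcess = stepUpCycle // nrProcesses   # 50 / 5 = 10 ids per process
    for i in range(nrProcesses):
        print(i, startURL - i * stepUpProcess, startURL - (i + 1) * stepUpProcess)
    # 0 2000000 1999990
    # 1 1999990 1999980
    # 2 1999980 1999970
    # 3 1999970 1999960
    # 4 1999960 1999950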