Fix authors & OpenLibrary scraping

2025-11-04 09:42:11 +01:00 · 2020-02-10 11:29:26 +01:00
parent c7d804d9bf
commit 7fd8e92371
2 changed files with 45 additions and 20 deletions
--- a/media/forms.py
+++ b/media/forms.py
@@ -7,6 +7,7 @@ import urllib.request

 from django.forms import ModelForm

+from .models import Auteur
 from .scraper import BedetequeScraper


@@ -30,6 +31,7 @@ class MediaAdminForm(ModelForm):
        # If results, then take the most accurate
        data = scraper.scrap_bd_info(r[0])
        self.cleaned_data.update(data)
+        print(self.cleaned_data)
        return True

    def download_data_openlibrary(self, isbn):
@@ -53,6 +55,23 @@ class MediaAdminForm(ModelForm):
                if 'number_of_pages' in data:
                    self.cleaned_data['number_of_pages'] = \
                        data['number_of_pages']
+                elif not self.cleaned_data['number_of_pages']:
+                    self.cleaned_data['number_of_pages'] = 0
+                if 'publish_date' in data:
+                    months = ['January', 'February', "March", "April", "Mai",
+                              "June", "July", "August", "September",
+                              "October", "November","December"]
+                    split = data['publish_date'].replace(',', '').split(' ')
+                    self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}"\
+                        .format(split[2], months.index(split[0])+1,
+                                int(split[1]),)
+                if 'authors' in data:
+                    if 'author' not in self.cleaned_data:
+                        self.cleaned_data['authors'] = list()
+                    for author in data['authors']:
+                        author_obj = Auteur.objects.get_or_create(
+                            name=author['name'])[0]
+                        self.cleaned_data['authors'].append(author_obj)
                return True
        return False

@@ -62,7 +81,6 @@ class MediaAdminForm(ModelForm):
        """
        super().clean()

-        # TODO implement authors, side_identifier
        if "_continue" in self.request.POST:
            isbn = self.cleaned_data.get('isbn')
            if isbn:
@@ -72,6 +90,23 @@ class MediaAdminForm(ModelForm):
                    # Try with OpenLibrary
                    self.download_data_openlibrary(isbn)

+                if self.cleaned_data['authors']:
+                    author_name = self.cleaned_data['authors'][0].name
+                    if ',' not in author_name and ' ' in author_name:
+                        author_name = author_name.split(' ')[1]
+                    side_identifier = "{:.3} {:.3}".format(
+                        author_name.upper(),
+                        self.cleaned_data['title'].upper(), )
+
+                    if self.cleaned_data['subtitle']:
+                        start = self.cleaned_data['subtitle'] \
+                                .split(' ')[0].replace('.', '')
+
+                        if start.isnumeric():
+                            side_identifier += " {:0>2}".format(start, )
+
+                    self.cleaned_data['side_identifier'] = side_identifier
+
        return self.cleaned_data

    def _clean_fields(self):
--- a/media/scraper.py
+++ b/media/scraper.py
@@ -5,6 +5,8 @@ import re

 import requests

+from media.models import Auteur
+

 class BedetequeScraper:
    """
@@ -75,10 +77,6 @@ class BedetequeScraper:
            subtitle = subtitle.replace('<span class="numa"></span>', '')
            data['subtitle'] = ' '.join(subtitle.split())

-        # TODO implement author
-        # regex_author = r'author\">([^<]*)</span'
-        # 'author': re.search(regex_author, content).group(1),
-
        # Get publish date
        search_publish_date = re.search(regex_publish_date, content)
        if search_publish_date:
@@ -92,23 +90,15 @@ class BedetequeScraper:
        # Get author and illustrator
        author = re.search(regex_author, content)
        if 'author' not in data:
-            data['author'] = list()
+            data['authors'] = list()
        if author:
-            data['author'].append(author.group(1))
+            author_obj = Auteur.objects.get_or_create(
+                name=author.group(1))[0]
+            data['authors'].append(author_obj)
        illustrator = re.search(regex_illustrator, content)
        if illustrator:
-            data['author'].append(illustrator.group(1))
-
-        author_name = data['author'][0]
-        if ',' not in author_name and ' ' in author_name:
-            author_name = author_name.split(' ')[1]
-        side_identifier = "{:.3} {:.3}".format(author_name.upper(),
-                                               data['title'].upper(),)
-        if data['subtitle']:
-            start = data['subtitle'].split(' ')[0].replace('.', '')
-            print("start:", start)
-            if start.isnumeric():
-                side_identifier += " {:0>2}".format(start,)
-        data['side_identifier'] = side_identifier
+            author_obj = Auteur.objects.get_or_create(
+                name=illustrator.group(1))[0]
+            data['authors'].append(author_obj)

        return data