mirror of
				https://gitlab.crans.org/mediatek/med.git
				synced 2025-11-04 09:42:11 +01:00 
			
		
		
		
	Fix authors and OpenLibrary scraping
This commit is contained in:
		@@ -7,6 +7,7 @@ import urllib.request
 | 
			
		||||
 | 
			
		||||
from django.forms import ModelForm
 | 
			
		||||
 | 
			
		||||
from .models import Auteur
 | 
			
		||||
from .scraper import BedetequeScraper
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
@@ -30,6 +31,7 @@ class MediaAdminForm(ModelForm):
 | 
			
		||||
        # If results, then take the most accurate
 | 
			
		||||
        data = scraper.scrap_bd_info(r[0])
 | 
			
		||||
        self.cleaned_data.update(data)
 | 
			
		||||
        print(self.cleaned_data)
 | 
			
		||||
        return True
 | 
			
		||||
 | 
			
		||||
    def download_data_openlibrary(self, isbn):
 | 
			
		||||
@@ -53,6 +55,23 @@ class MediaAdminForm(ModelForm):
 | 
			
		||||
                if 'number_of_pages' in data:
 | 
			
		||||
                    self.cleaned_data['number_of_pages'] = \
 | 
			
		||||
                        data['number_of_pages']
 | 
			
		||||
                elif not self.cleaned_data['number_of_pages']:
 | 
			
		||||
                    self.cleaned_data['number_of_pages'] = 0
 | 
			
		||||
                if 'publish_date' in data:
 | 
			
		||||
                    months = ['January', 'February', "March", "April", "Mai",
 | 
			
		||||
                              "June", "July", "August", "September",
 | 
			
		||||
                              "October", "November","December"]
 | 
			
		||||
                    split = data['publish_date'].replace(',', '').split(' ')
 | 
			
		||||
                    self.cleaned_data['publish_date'] = "{}-{:02d}-{:02d}"\
 | 
			
		||||
                        .format(split[2], months.index(split[0])+1,
 | 
			
		||||
                                int(split[1]),)
 | 
			
		||||
                if 'authors' in data:
 | 
			
		||||
                    if 'author' not in self.cleaned_data:
 | 
			
		||||
                        self.cleaned_data['authors'] = list()
 | 
			
		||||
                    for author in data['authors']:
 | 
			
		||||
                        author_obj = Auteur.objects.get_or_create(
 | 
			
		||||
                            name=author['name'])[0]
 | 
			
		||||
                        self.cleaned_data['authors'].append(author_obj)
 | 
			
		||||
                return True
 | 
			
		||||
        return False
 | 
			
		||||
 | 
			
		||||
@@ -62,7 +81,6 @@ class MediaAdminForm(ModelForm):
 | 
			
		||||
        """
 | 
			
		||||
        super().clean()
 | 
			
		||||
 | 
			
		||||
        # TODO implement authors, side_identifier
 | 
			
		||||
        if "_continue" in self.request.POST:
 | 
			
		||||
            isbn = self.cleaned_data.get('isbn')
 | 
			
		||||
            if isbn:
 | 
			
		||||
@@ -72,6 +90,23 @@ class MediaAdminForm(ModelForm):
 | 
			
		||||
                    # Try with OpenLibrary
 | 
			
		||||
                    self.download_data_openlibrary(isbn)
 | 
			
		||||
 | 
			
		||||
                if self.cleaned_data['authors']:
 | 
			
		||||
                    author_name = self.cleaned_data['authors'][0].name
 | 
			
		||||
                    if ',' not in author_name and ' ' in author_name:
 | 
			
		||||
                        author_name = author_name.split(' ')[1]
 | 
			
		||||
                    side_identifier = "{:.3} {:.3}".format(
 | 
			
		||||
                        author_name.upper(),
 | 
			
		||||
                        self.cleaned_data['title'].upper(), )
 | 
			
		||||
 | 
			
		||||
                    if self.cleaned_data['subtitle']:
 | 
			
		||||
                        start = self.cleaned_data['subtitle'] \
 | 
			
		||||
                                .split(' ')[0].replace('.', '')
 | 
			
		||||
 | 
			
		||||
                        if start.isnumeric():
 | 
			
		||||
                            side_identifier += " {:0>2}".format(start, )
 | 
			
		||||
 | 
			
		||||
                    self.cleaned_data['side_identifier'] = side_identifier
 | 
			
		||||
 | 
			
		||||
        return self.cleaned_data
 | 
			
		||||
 | 
			
		||||
    def _clean_fields(self):
 | 
			
		||||
 
 | 
			
		||||
@@ -5,6 +5,8 @@ import re
 | 
			
		||||
 | 
			
		||||
import requests
 | 
			
		||||
 | 
			
		||||
from media.models import Auteur
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
class BedetequeScraper:
 | 
			
		||||
    """
 | 
			
		||||
@@ -75,10 +77,6 @@ class BedetequeScraper:
 | 
			
		||||
            subtitle = subtitle.replace('<span class="numa"></span>', '')
 | 
			
		||||
            data['subtitle'] = ' '.join(subtitle.split())
 | 
			
		||||
 | 
			
		||||
        # TODO implement author
 | 
			
		||||
        # regex_author = r'author\">([^<]*)</span'
 | 
			
		||||
        # 'author': re.search(regex_author, content).group(1),
 | 
			
		||||
 | 
			
		||||
        # Get publish date
 | 
			
		||||
        search_publish_date = re.search(regex_publish_date, content)
 | 
			
		||||
        if search_publish_date:
 | 
			
		||||
@@ -92,23 +90,15 @@ class BedetequeScraper:
 | 
			
		||||
        # Get author and illustrator
 | 
			
		||||
        author = re.search(regex_author, content)
 | 
			
		||||
        if 'author' not in data:
 | 
			
		||||
            data['author'] = list()
 | 
			
		||||
            data['authors'] = list()
 | 
			
		||||
        if author:
 | 
			
		||||
            data['author'].append(author.group(1))
 | 
			
		||||
            author_obj = Auteur.objects.get_or_create(
 | 
			
		||||
                name=author.group(1))[0]
 | 
			
		||||
            data['authors'].append(author_obj)
 | 
			
		||||
        illustrator = re.search(regex_illustrator, content)
 | 
			
		||||
        if illustrator:
 | 
			
		||||
            data['author'].append(illustrator.group(1))
 | 
			
		||||
 | 
			
		||||
        author_name = data['author'][0]
 | 
			
		||||
        if ',' not in author_name and ' ' in author_name:
 | 
			
		||||
            author_name = author_name.split(' ')[1]
 | 
			
		||||
        side_identifier = "{:.3} {:.3}".format(author_name.upper(),
 | 
			
		||||
                                               data['title'].upper(),)
 | 
			
		||||
        if data['subtitle']:
 | 
			
		||||
            start = data['subtitle'].split(' ')[0].replace('.', '')
 | 
			
		||||
            print("start:", start)
 | 
			
		||||
            if start.isnumeric():
 | 
			
		||||
                side_identifier += " {:0>2}".format(start,)
 | 
			
		||||
        data['side_identifier'] = side_identifier
 | 
			
		||||
            author_obj = Auteur.objects.get_or_create(
 | 
			
		||||
                name=illustrator.group(1))[0]
 | 
			
		||||
            data['authors'].append(author_obj)
 | 
			
		||||
 | 
			
		||||
        return data
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user