mirror of
				https://gitlab.crans.org/mediatek/med.git
				synced 2025-11-04 09:42:11 +01:00 
			
		
		
		
	Scrape author and illustrator (may not work for some books)
This commit is contained in:
		@@ -56,6 +56,8 @@ class BedetequeScraper:
 | 
			
		||||
        regex_subtitle = r'<h2>\s*(.*)</h2>'
 | 
			
		||||
        regex_publish_date = r'datePublished\" content=\"([\d-]*)\">'
 | 
			
		||||
        regex_nb_of_pages = r'numberOfPages\">(\d*)</span'
 | 
			
		||||
        regex_author = r'<span itemprop=\"author\">(((?!<).)*)</span>'
 | 
			
		||||
        regex_illustrator = r'<span itemprop=\"illustrator\">(((?!<).)*)</span>'
 | 
			
		||||
 | 
			
		||||
        data = {
 | 
			
		||||
            'external_url': bd_url,
 | 
			
		||||
@@ -87,4 +89,16 @@ class BedetequeScraper:
 | 
			
		||||
        if search_nb_pages and search_nb_pages.group(1).isnumeric():
 | 
			
		||||
            data['number_of_pages'] = search_nb_pages.group(1)
 | 
			
		||||
 | 
			
		||||
        # Get author and illustrator
 | 
			
		||||
        author = re.search(regex_author, content)
 | 
			
		||||
        if not 'author' in data:
 | 
			
		||||
            data['author'] = list()
 | 
			
		||||
        if author:
 | 
			
		||||
            data['author'].append(author.group(1))
 | 
			
		||||
        illustrator = re.search(regex_illustrator, content)
 | 
			
		||||
        if illustrator:
 | 
			
		||||
            data['author'].append(illustrator.group(1))
 | 
			
		||||
 | 
			
		||||
        print(data)
 | 
			
		||||
 | 
			
		||||
        return data
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user