mirror of
				https://code.hackerspace.pl/q3k/youtube-dl
				synced 2025-03-16 11:43:02 +00:00 
			
		
		
		
	Improve the OpenGraph regex
* Do not accept '>' between the property and content attributes.
* Recognize the properties if the content attribute is before the property attribute, using two regexes (fixes the extraction of the description for SlideshareIE).
This commit is contained in:
		
							parent
							
								
									85d61685f1
								
							
						
					
					
						commit
						ab2d524780
					
				| @ -315,13 +315,17 @@ class InfoExtractor(object): | |||||||
| 
 | 
 | ||||||
|     # Helper functions for extracting OpenGraph info |     # Helper functions for extracting OpenGraph info | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def _og_regex(prop): |     def _og_regexes(prop): | ||||||
|         return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop) |         esc_prop = re.escape(prop) | ||||||
|  |         return [ | ||||||
|  |             r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop, | ||||||
|  |             r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop, | ||||||
|  |         ] | ||||||
| 
 | 
 | ||||||
|     def _og_search_property(self, prop, html, name=None, **kargs): |     def _og_search_property(self, prop, html, name=None, **kargs): | ||||||
|         if name is None: |         if name is None: | ||||||
|             name = 'OpenGraph %s' % prop |             name = 'OpenGraph %s' % prop | ||||||
|         escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) |         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) | ||||||
|         if escaped is None: |         if escaped is None: | ||||||
|             return None |             return None | ||||||
|         return unescapeHTML(escaped) |         return unescapeHTML(escaped) | ||||||
| @ -336,8 +340,8 @@ class InfoExtractor(object): | |||||||
|         return self._og_search_property('title', html, **kargs) |         return self._og_search_property('title', html, **kargs) | ||||||
| 
 | 
 | ||||||
|     def _og_search_video_url(self, html, name='video url', secure=True, **kargs): |     def _og_search_video_url(self, html, name='video url', secure=True, **kargs): | ||||||
|         regexes = [self._og_regex('video')] |         regexes = self._og_regexes('video') | ||||||
|         if secure: regexes.insert(0, self._og_regex('video:secure_url')) |         if secure: regexes = self._og_regexes('video:secure_url') + regexes | ||||||
|         return self._html_search_regex(regexes, html, name, **kargs) |         return self._html_search_regex(regexes, html, name, **kargs) | ||||||
| 
 | 
 | ||||||
|     def _rta_search(self, html): |     def _rta_search(self, html): | ||||||
|  | |||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user
	 Jaime Marquínez Ferrándiz
						Jaime Marquínez Ferrándiz