Spaces:
Sleeping
Sleeping
| import time | |
| import logging | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from typing import Dict, Optional | |
| from urllib.parse import quote, urlparse | |
# Configure the root logger once at import time: records at INFO level and
# above are written to ``error.log`` using a 12-hour month-day-year timestamp.
logging.basicConfig(
    level=logging.INFO,
    filename='error.log',
    datefmt='%m-%d-%Y / %I:%M:%S %p',
    format='%(asctime)s | [%(levelname)s]: %(message)s',
)
class SearchResults:
    """Hold a list of result dicts and render them as readable text."""

    def __init__(self, results):
        # Each entry is expected to be a dict; ``__str__`` reads its
        # 'title' and 'link' keys.
        self.results = results

    def __str__(self):
        """Return one delimited "Title / Link" section per stored result.

        Missing keys are rendered with a "... not found" placeholder. An
        empty result list renders as the empty string.
        """
        sections = []
        for entry in self.results:
            heading = entry.get('title', 'Title not found')
            target = entry.get('link', 'Link not found')
            sections.append(f"---\nTitle: {heading}\nLink: {target}\n---\n")
        return "".join(sections)
class ReverseImageSearch:
    """Fetch and scrape paginated reverse-image-search result pages.

    The public entry point is :meth:`response`; the underscore-prefixed
    methods are internal helpers for validation, HTTP access and HTML parsing.
    """

    # Seconds to wait for the server before a request is abandoned. Without
    # a timeout ``requests.get`` can block indefinitely.
    timeout = 10

    def __init__(self):
        self.base_url = "https://yandex.ru/images/search"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
        # Maximum number of attempts per URL when an HTTP error status occurs.
        self.retry_count = 3
        # Seconds to sleep between those attempts.
        self.retry_delay = 1

    def response(self, query: str, image_url: str, max_results: int = 10, delay: int = 1) -> "SearchResults":
        """Collect up to ``max_results`` unique results for a query and image.

        Args:
            query: Text query sent as the ``q`` parameter.
            image_url: URL of the image sent as the ``image_url`` parameter.
            max_results: Upper bound on the number of results returned.
            delay: Seconds to sleep before every page request after the first.

        Returns:
            A ``SearchResults`` instance when at least one result was found.
            When nothing was found, a plain explanatory string is returned
            instead (kept for backward compatibility with existing callers).

        Raises:
            ValueError: If ``query`` or ``image_url`` is empty, or the image
                URL does not end in a supported extension.
        """
        self._validate_input(query, image_url)

        encoded_query = quote(query)
        encoded_image_url = quote(image_url)
        url = f"{self.base_url}?q={encoded_query}&image_url={encoded_image_url}&sbisrc=cr_1_5_2"

        all_results = []
        start_index = 0
        while len(all_results) < max_results:
            if start_index != 0:
                time.sleep(delay)

            page = self._make_request(f"{url}&start={start_index}")
            if page is None:
                break

            search_results, valid_content = self._parse_search_results(page.text)
            if not valid_content:
                logging.warning("Unexpected HTML structure encountered.")
                break

            count_before_page = len(all_results)
            for result in search_results:
                if len(all_results) >= max_results:
                    break
                data = self._extract_result_data(result)
                if data and data not in all_results:
                    all_results.append(data)

            # A page that contributed nothing new means the next request
            # would use the same offset and return the same page; stop here
            # instead of re-requesting it forever.
            if len(all_results) == count_before_page:
                break
            start_index = len(all_results)

        if not all_results:
            logging.warning(
                "No results were found for the given query: [%s], and/or image URL: [%s].",
                query,
                image_url,
            )
            return "No results found. Please try again with a different query and/or image URL."
        return SearchResults(all_results[:max_results])

    def _validate_input(self, query: str, image_url: str):
        """Raise ``ValueError`` unless both inputs are non-empty and the URL looks like an image."""
        if not query:
            raise ValueError("Query not found. Please enter a query and try again.")
        if not image_url:
            raise ValueError("Image URL not found. Please enter an image URL and try again.")
        if not self._validate_image_url(image_url):
            raise ValueError("Invalid image URL. Please enter a valid image URL and try again.")

    def _validate_image_url(self, url: str) -> bool:
        """Return True when the URL's path ends in .jpg, .jpeg, .png or .webp.

        Only the path component is inspected (case-insensitively), so query
        strings and fragments do not affect the result.
        """
        path = urlparse(url).path.lower()
        valid_extensions = (".jpg", ".jpeg", ".png", ".webp")
        return path.endswith(valid_extensions)

    def _make_request(self, url: str):
        """GET ``url`` and return the response, or None on failure.

        An HTML response with an error status is retried up to
        ``self.retry_count`` times, sleeping ``self.retry_delay`` seconds
        between attempts. A non-HTML response, or any other exception
        (including a timeout), ends the attempt immediately with None.
        """
        attempts = 0
        while attempts < self.retry_count:
            try:
                response = requests.get(url, headers=self.headers, timeout=self.timeout)
                if not response.headers.get('Content-Type', '').startswith('text/html'):
                    logging.warning("Non-HTML content received.")
                    return None
                response.raise_for_status()
                return response
            except requests.exceptions.HTTPError as http_err:
                logging.error("HTTP error occurred: %s", http_err)
                attempts += 1
                time.sleep(self.retry_delay)
            except Exception as err:
                logging.error("An error occurred: %s", err)
                return None
        return None

    def _parse_search_results(self, html_content: str) -> (Optional[list], bool):
        """Parse HTML and return ``(result_elements, True)`` or ``(None, False)``.

        Result elements are every ``<div>`` carrying the CSS class ``g``.
        NOTE(review): ``div.g`` looks like a Google-style result container;
        confirm it matches the markup actually served by ``self.base_url``.
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")
            return soup.find_all('div', class_='g'), True
        except Exception as e:
            logging.error("Error parsing HTML content: %s", e)
            return None, False

    def _extract_result_data(self, result) -> Dict:
        """Return ``{"link", "title"}`` for a result element, or ``{}``.

        The link is the ``href`` of the first anchor that has one and the
        title is the stripped text of the first ``<h3>``. An empty dict is
        returned when either piece is missing or empty.
        """
        anchor = result.find('a', href=True)
        heading = result.find('h3')
        link = anchor['href'] if anchor else None
        title = heading.get_text(strip=True) if heading else None
        return {"link": link, "title": title} if link and title else {}
def yandex_reverse_image_search(image_url):
    """Fetch a page and collect its image sources and related-search elements.

    Args:
        image_url: URL that is requested with a GET. The ``__main__`` script
            in this file passes the search-results URL built from the upload
            response rather than a direct image link.

    Returns:
        A ``(image_urls, related_searches)`` tuple. ``image_urls`` is the
        ``src`` of every ``<img>`` that has one; ``related_searches`` is the
        list of parsed elements carrying the ``related-searches`` class.
        On any request failure the error is printed and ``([], [])`` is
        returned.
    """
    # Simulate a browser user agent to avoid being blocked.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

    try:
        # A timeout keeps an unresponsive server from hanging the caller.
        response = requests.get(image_url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching image: {e}")
        return [], []

    soup = BeautifulSoup(response.content, 'html.parser')

    # ``src=True`` skips <img> tags without a src attribute, which would
    # otherwise raise an uncaught KeyError on ``img['src']``.
    # (Selectors are examples - adapt based on the page's HTML structure.)
    image_urls = [img['src'] for img in soup.find_all('img', src=True)]
    related_searches = list(soup.find_all(class_="related-searches"))

    return image_urls, related_searches
if __name__ == "__main__":
    import json

    # URL-based flow, kept as a usage example:
    #   request = ReverseImageSearch()
    #   response = request.response(
    #       query="Example Query",
    #       image_url="https://ichef.bbci.co.uk/images/ic/1024xn/p0khzhhl.jpg.webp",
    #       max_results=5
    #   )
    #   print(response)

    # Path to the local image that is uploaded for the search.
    image_path = "C:\\TTProjects\\prj-nict-ai-content-detection\\data\\test_data\\towel.jpg"

    search_url = 'https://yandex.ru/images/search'
    params = {'rpt': 'imageview', 'format': 'json', 'request': '{"blocks":[{"block":"b-page_type_search-by-image__link"}]}'}

    # Upload the image; the context manager closes the file handle even if
    # the request raises.
    with open(image_path, 'rb') as image_file:
        files = {'upfile': ('blob', image_file, 'image/jpeg')}
        response = requests.post(search_url, params=params, files=files, timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    response.raise_for_status()

    # The JSON reply carries the query string of the results page for the
    # uploaded image.
    query_string = json.loads(response.content)['blocks'][0]['params']['url']
    img_search_url = search_url + '?' + query_string
    print(img_search_url)

    image_urls, related_searches = yandex_reverse_image_search(img_search_url)

    print("Image URLs:", image_urls)
    print("Related Searches:", related_searches)