mgbam committed on
Commit
53e6ab1
·
verified ·
1 Parent(s): c04089b

Rename extractor.py to search_replace.py

Browse files
Files changed (2) hide show
  1. extractor.py +0 -55
  2. search_replace.py +90 -0
extractor.py DELETED
@@ -1,55 +0,0 @@
1
- # /extractor.py
2
- """ Handles content extraction from various sources like files, images, and websites. """
3
- import mimetypes, os, re, logging
4
- from urllib.parse import urljoin
5
- import PyPDF2, docx, requests
6
- from bs4 import BeautifulSoup
7
-
8
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
9
-
10
- try:
11
- import cv2, pytesseract
12
- OCR_AVAILABLE = True
13
- except ImportError:
14
- OCR_AVAILABLE = False
15
- logging.warning("OCR libraries not found. Text extraction from images will be disabled.")
16
-
17
def extract_text_from_image(image_path: str) -> str:
    """Run OCR on an image file and return the recognized text.

    The function never raises: every failure is reported as a
    human-readable string so callers can show it directly.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The text produced by ``pytesseract.image_to_string``, the string
        "No text found in image." when that result is empty, or an
        "Error..." message when OCR is unavailable or fails.
    """
    if not OCR_AVAILABLE:
        return "Error: OCR dependencies not installed."
    try:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread reports a missing or undecodable file by returning
            # None instead of raising; without this check the failure only
            # surfaces as an opaque OpenCV error from cvtColor below.
            return f"Error during OCR: could not read image file: {image_path}"
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        return pytesseract.image_to_string(gray) or "No text found in image."
    except Exception as e:
        return f"Error during OCR: {e}"
24
-
25
def extract_text_from_file(file_path: str) -> str:
    """Return the text content of a file, chosen by its extension.

    PDF pages are joined with newlines, DOCX paragraphs are joined with
    newlines, plain-text formats are read as UTF-8 (undecodable bytes are
    ignored), and image formats are passed to ``extract_text_from_image``.

    Args:
        file_path: Path of the file to read. A falsy value yields "".

    Returns:
        The extracted text, "Unsupported file type: <ext>" for an unknown
        extension, or "Error extracting text: <details>" if reading fails.
    """
    if not file_path:
        return ""

    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    plain_text_suffixes = (".txt", ".md", ".csv", ".html", ".css", ".js", ".py")
    image_suffixes = (".jpg", ".jpeg", ".png", ".bmp", ".tiff")

    try:
        if suffix == ".pdf":
            with open(file_path, "rb") as handle:
                reader = PyPDF2.PdfReader(handle)
                # Pages must be read while the file handle is still open.
                page_texts = [page.extract_text() or "" for page in reader.pages]
            return "\n".join(page_texts)

        if suffix == ".docx":
            document = docx.Document(file_path)
            return "\n".join(paragraph.text for paragraph in document.paragraphs)

        if suffix in plain_text_suffixes:
            with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
                return handle.read()

        if suffix in image_suffixes:
            return extract_text_from_image(file_path)

        return f"Unsupported file type: {suffix}"
    except Exception as e:
        return f"Error extracting text: {e}"
39
-
40
def extract_website_content(url: str) -> str:
    """Fetch a web page and return its HTML with absolute resource URLs.

    The page is requested with a browser-like User-Agent, a 15 second
    timeout, and redirects enabled. The ``src`` of ``img`` and ``script``
    tags and the ``href`` of ``link`` tags are resolved against ``url``.
    The serialized HTML is cut off after 15000 characters.

    Args:
        url: Address of the page to download.

    Returns:
        The (possibly truncated) HTML prefixed with a comment naming the
        original URL, or an "Error: ..." message if anything fails.
    """
    max_html_chars = 15000
    request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    url_attributes = (('img', 'src'), ('link', 'href'), ('script', 'src'))

    try:
        response = requests.get(url, headers=request_headers, timeout=15, allow_redirects=True)
        response.raise_for_status()
        # Decode with the encoding detected from the body itself.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        for tag_name, attribute in url_attributes:
            for element in soup.find_all(tag_name):
                if not element.has_attr(attribute):
                    continue
                element[attribute] = urljoin(url, element[attribute])

        html = str(soup)
        if len(html) > max_html_chars:
            html = html[:max_html_chars] + "\n<!-- ... HTML truncated ... -->"
        return f"<!-- Original URL: {url} -->\n{html}"
    except Exception as e:
        return f"Error: Could not fetch content from {url}. Details: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
search_replace.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def _parse_search_replace_pairs(changes_text: str) -> list:
    """Parse SEARCH/REPLACE blocks into ``(search_text, replace_text)`` pairs.

    A block starts at a line equal to ``SEARCH_START`` (ignoring surrounding
    whitespace), switches from search to replacement lines at ``DIVIDER``,
    and ends at ``REPLACE_END``, at the next ``SEARCH_START``, or at the end
    of the text. Lines outside any block are ignored. Both texts are
    whitespace-stripped, and blocks whose search text is empty are dropped.
    """
    pairs = []
    search_lines = []
    replace_lines = []
    in_search = False
    in_replace = False

    def flush():
        search_text = "\n".join(search_lines).strip()
        # An empty search text must never reach str.replace: replacing ""
        # inserts the replacement between every character of the content.
        if search_text:
            pairs.append((search_text, "\n".join(replace_lines).strip()))
        search_lines.clear()
        replace_lines.clear()

    for line in changes_text.split("\n"):
        marker = line.strip()
        if marker == SEARCH_START:
            # A new block implicitly closes an unterminated previous one.
            flush()
            in_search, in_replace = True, False
        elif marker == DIVIDER:
            in_search, in_replace = False, True
        elif marker == REPLACE_END:
            flush()
            in_search = in_replace = False
        elif in_search:
            search_lines.append(line)
        elif in_replace:
            replace_lines.append(line)
    flush()
    return pairs


def apply_search_replace_changes(original_content: str, changes_text: str) -> str:
    """Apply search/replace changes to content (HTML, Python, etc.).

    Blocks are applied in order, each to the result of the previous one.
    Every occurrence of a block's search text is replaced. A block whose
    search text is not present prints a warning and is skipped; a block
    whose search text is empty is skipped silently.

    Args:
        original_content: The text to modify.
        changes_text: One or more blocks delimited by the module-level
            ``SEARCH_START``, ``DIVIDER`` and ``REPLACE_END`` marker lines.

    Returns:
        The modified content, or ``original_content`` unchanged when
        ``changes_text`` is blank.
    """
    if not changes_text.strip():
        return original_content

    modified_content = original_content
    for search_text, replace_text in _parse_search_replace_pairs(changes_text):
        if search_text in modified_content:
            modified_content = modified_content.replace(search_text, replace_text)
        else:
            print(f"Warning: Search text not found in content: {search_text[:100]}...")
    return modified_content
43
+
44
def _parse_transformers_js_changes(changes_text: str) -> list:
    """Parse SEARCH/REPLACE blocks into ``(search_text, replace_text)`` pairs.

    A block starts at a line equal to ``SEARCH_START`` (ignoring surrounding
    whitespace), switches from search to replacement lines at ``DIVIDER``,
    and ends at ``REPLACE_END``, at the next ``SEARCH_START``, or at the end
    of the text. Lines outside any block are ignored. Both texts are
    whitespace-stripped, and blocks whose search text is empty are dropped.
    """
    pairs = []
    search_lines = []
    replace_lines = []
    in_search = False
    in_replace = False

    def flush():
        search_text = "\n".join(search_lines).strip()
        # An empty search text must never reach str.replace: replacing ""
        # inserts the replacement between every character of the content.
        if search_text:
            pairs.append((search_text, "\n".join(replace_lines).strip()))
        search_lines.clear()
        replace_lines.clear()

    for line in changes_text.split("\n"):
        marker = line.strip()
        if marker == SEARCH_START:
            # A new block implicitly closes an unterminated previous one.
            flush()
            in_search, in_replace = True, False
        elif marker == DIVIDER:
            in_search, in_replace = False, True
        elif marker == REPLACE_END:
            flush()
            in_search = in_replace = False
        elif in_search:
            search_lines.append(line)
        elif in_replace:
            replace_lines.append(line)
    flush()
    return pairs


def apply_transformers_js_search_replace_changes(original_formatted_content: str, changes_text: str) -> str:
    """Apply search/replace changes to transformers.js formatted content (three files).

    The formatted content is split with ``parse_transformers_js_output``.
    Each block is applied to the first of ``index.html``, ``index.js`` and
    ``style.css`` (checked in that order) that contains its search text,
    replacing every occurrence in that file. A block matching no file prints
    a warning and is skipped; a block whose search text is empty is skipped
    silently. The files are re-joined with ``format_transformers_js_output``.

    Args:
        original_formatted_content: The combined three-file text to modify.
        changes_text: One or more blocks delimited by the module-level
            ``SEARCH_START``, ``DIVIDER`` and ``REPLACE_END`` marker lines.

    Returns:
        The re-formatted content, or ``original_formatted_content``
        unchanged when ``changes_text`` is blank.
    """
    if not changes_text.strip():
        return original_formatted_content

    files = parse_transformers_js_output(original_formatted_content)
    candidate_files = ('index.html', 'index.js', 'style.css')

    for search_text, replace_text in _parse_transformers_js_changes(changes_text):
        target_file = next(
            (name for name in candidate_files if search_text in files[name]),
            None,
        )
        if target_file is not None:
            files[target_file] = files[target_file].replace(search_text, replace_text)
        else:
            print(f"Warning: Search text not found in any transformers.js file: {search_text[:100]}...")
    return format_transformers_js_output(files)