Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -288,77 +288,107 @@ def fDistancePlot(text2Party):
|
|
| 288 |
return safe_plot(plot_func)
|
| 289 |
|
| 290 |
def DispersionPlot(textParty):
|
| 291 |
-
"""
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
| 293 |
try:
|
| 294 |
-
word_tokens_party = word_tokenize(textParty)
|
| 295 |
-
print(f"Debug DispersionPlot: Total tokens: {len(word_tokens_party)}")
|
| 296 |
if not word_tokens_party:
|
| 297 |
print("Warning: No tokens found for dispersion plot.")
|
| 298 |
return None
|
| 299 |
|
| 300 |
-
moby = Text(word_tokens_party)
|
| 301 |
fdistance = FreqDist(word_tokens_party)
|
| 302 |
-
print(f"Debug DispersionPlot: FreqDist sample: {list(fdistance.most_common(10))}")
|
| 303 |
|
| 304 |
# --- Improved word selection logic ---
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
| 310 |
|
| 311 |
# Select top 5 from filtered list
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
else:
|
| 315 |
-
word_Lst = [common_words_filtered[x][0] for x in range(5)]
|
| 316 |
-
|
| 317 |
-
# Final check: Ensure words are present in the Text object (moby)
|
| 318 |
-
final_word_list = [word for word in word_Lst if word in moby] # Check membership in the Text object
|
| 319 |
-
print(f"Debug DispersionPlot: Final word list for plot: {final_word_list}") # Debug print
|
| 320 |
|
| 321 |
if not final_word_list:
|
| 322 |
-
print("Warning: No suitable words found for dispersion plot
|
| 323 |
# Create a simple plot indicating no data
|
| 324 |
fig, ax = plt.subplots(figsize=(8, 3))
|
| 325 |
ax.text(0.5, 0.5, "No suitable words found for dispersion plot", ha='center', va='center', transform=ax.transAxes)
|
| 326 |
ax.set_xlim(0, 1)
|
| 327 |
ax.set_ylim(0, 1)
|
| 328 |
-
ax.axis('off')
|
| 329 |
fig.suptitle('Dispersion Plot')
|
| 330 |
else:
|
| 331 |
-
# ---
|
| 332 |
-
fig = plt.
|
| 333 |
-
|
| 334 |
-
#
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
plt.tight_layout()
|
| 337 |
|
| 338 |
buf = BytesIO()
|
| 339 |
-
# Handle potential apply_aspect error
|
| 340 |
try:
|
| 341 |
-
fig.savefig(buf, format='png', bbox_inches='tight')
|
| 342 |
except AttributeError as ae:
|
| 343 |
if "apply_aspect" in str(ae):
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
else:
|
| 349 |
-
|
| 350 |
buf.seek(0)
|
| 351 |
img = Image.open(buf)
|
| 352 |
-
plt.close(fig)
|
| 353 |
return img
|
| 354 |
|
| 355 |
except Exception as e:
|
| 356 |
print(f"Dispersion plot error: {e}")
|
| 357 |
if buf:
|
| 358 |
-
buf.close()
|
| 359 |
traceback.print_exc()
|
| 360 |
-
plt.close('all')
|
| 361 |
-
return None
|
|
|
|
| 362 |
|
| 363 |
def word_cloud_generator(parsed_text_name, text_Party):
|
| 364 |
"""Generates the word cloud image."""
|
|
|
|
| 288 |
return safe_plot(plot_func)
|
| 289 |
|
| 290 |
def DispersionPlot(textParty):
    """
    Generates a dispersion plot using Matplotlib.

    Shows the positions of the most common words along the text: one row per
    word, one marker per occurrence, with the token index on the x-axis.

    Args:
        textParty: The raw text to analyse. It is lowercased before
            tokenization so that counting and matching are case-insensitive.

    Returns:
        The image object produced by ``Image.open`` on the rendered PNG, or
        ``None`` when the text yields no tokens or any error occurs.
    """
    buf = None
    try:
        word_tokens_party = word_tokenize(textParty.lower())  # Lowercase for matching
        print(f"Debug DispersionPlot: Total tokens: {len(word_tokens_party)}")
        if not word_tokens_party:
            print("Warning: No tokens found for dispersion plot.")
            return None

        fdistance = FreqDist(word_tokens_party)
        print(f"Debug DispersionPlot: FreqDist sample: {list(fdistance.most_common(10))}")

        # --- Improved word selection logic ---
        common_words_raw = fdistance.most_common(15)
        # Keep purely alphabetic words longer than 2 characters. isalpha()
        # already rejects digit-only tokens and punctuation.
        common_words_filtered = [
            (word, freq) for word, freq in common_words_raw
            if len(word) > 2 and word.isalpha()
        ]
        print(f"Debug DispersionPlot: Filtered common words: {common_words_filtered}")

        # Select top 5 from filtered list
        final_word_list = [word for word, _ in common_words_filtered[:5]]
        print(f"Debug DispersionPlot: Final word list for plot: {final_word_list}")

        if not final_word_list:
            print("Warning: No suitable words found for dispersion plot.")
            # Create a simple plot indicating no data
            fig, ax = plt.subplots(figsize=(8, 3))
            ax.text(0.5, 0.5, "No suitable words found for dispersion plot", ha='center', va='center', transform=ax.transAxes)
            ax.set_xlim(0, 1)
            ax.set_ylim(0, 1)
            ax.axis('off')
            fig.suptitle('Dispersion Plot')
        else:
            # --- Create the dispersion plot manually ---
            fig, ax = plt.subplots(figsize=(12, 6))

            # plt.get_cmap is used instead of plt.cm.get_cmap, which was
            # deprecated in Matplotlib 3.7 and removed in 3.9.
            colors = plt.get_cmap('tab10', len(final_word_list))

            # Collect the token indices of every target word in one pass over
            # the text instead of rescanning the token list once per word.
            positions = {word: [] for word in final_word_list}
            for j, token in enumerate(word_tokens_party):
                if token in positions:
                    positions[token].append(j)

            for i, word in enumerate(final_word_list):
                offsets = positions[word]
                y_positions = [i + 1] * len(offsets)  # One horizontal row per word
                ax.scatter(offsets, y_positions, label=word, color=colors(i), alpha=0.7, s=30)  # s is marker size

            ax.set_xlabel("Position in Text (Token Index)")
            ax.set_ylabel("Words")
            ax.set_title("Dispersion Plot")

            # Set y-ticks to correspond to the words
            ax.set_yticks(range(1, len(final_word_list) + 1))
            ax.set_yticklabels(final_word_list)

            # Invert y-axis so the first word in the list is at the top
            ax.invert_yaxis()

            # Add grid for better readability
            ax.grid(True, axis='x', linestyle='--', alpha=0.5)

        plt.tight_layout()

        buf = BytesIO()
        # Handle potential apply_aspect error
        try:
            fig.savefig(buf, format='png', bbox_inches='tight', dpi=150)  # Added dpi for clarity
        except AttributeError as ae:
            if "apply_aspect" in str(ae):
                print(f"Warning: bbox_inches='tight' failed for Dispersion Plot ({ae}), saving without it.")
                # Discard whatever was partially written and retry on a
                # fresh buffer without the tight bounding box.
                buf.close()
                buf = BytesIO()
                fig.savefig(buf, format='png', dpi=150)
            else:
                raise
        buf.seek(0)
        img = Image.open(buf)
        plt.close(fig)
        return img

    except Exception as e:
        # Top-level boundary for the plotting pipeline: report the failure,
        # release the buffer and any open figures, and signal it with None.
        print(f"Dispersion plot error: {e}")
        if buf:
            buf.close()
        traceback.print_exc()
        plt.close('all')
        return None
|
| 391 |
+
|
| 392 |
|
| 393 |
def word_cloud_generator(parsed_text_name, text_Party):
|
| 394 |
"""Generates the word cloud image."""
|