Spaces:
Running
on
Zero
Running
on
Zero
File size: 5,413 Bytes
d1ed09d d934c35 d1ed09d d934c35 d1ed09d d934c35 d1ed09d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import numpy as np
from urllib.parse import urlparse, parse_qs
from pyalex import Works
import pandas as pd
def openalex_url_to_pyalex_query(url):
    """
    Convert an OpenAlex search URL to a pyalex query.

    The ``filter`` query parameter is split on commas into ``key:value``
    fragments: ``default.search`` is turned into a ``.search()`` call and
    every other fragment is forwarded to ``.filter()``. The ``sort``
    parameter is split on commas and accepts ``field:direction``,
    ``-field`` (descending) and bare ``field`` (ascending) entries.

    Args:
        url (str): The OpenAlex search URL.

    Returns:
        tuple: (Works object, dict of parameters). The dict holds the raw
        string values of ``page``, ``per-page``, ``sample`` and ``seed``
        for whichever of those keys appear in the URL.
    """
    query_params = parse_qs(urlparse(url).query)

    # Initialize the Works object
    query = Works()

    # Handle filters; fragments without a ':' carry no key and are ignored.
    if 'filter' in query_params:
        for fragment in query_params['filter'][0].split(','):
            if ':' not in fragment:
                continue
            key, value = fragment.split(':', 1)
            if key == 'default.search':
                query = query.search(value)
            else:
                query = query.filter(**{key: value})

    # Handle sort
    if 'sort' in query_params:
        for entry in query_params['sort'][0].split(','):
            if ':' in entry:
                # field:direction format; maxsplit=1 so a stray extra colon
                # cannot break the two-name unpacking.
                field, direction = entry.split(':', 1)
            elif entry.startswith('-'):
                # -field format
                field, direction = entry[1:], 'desc'
            else:
                # bare field format
                field, direction = entry, 'asc'
            # Empty entries (e.g. from a trailing comma) have no field name
            # and would otherwise be sent as a sort on ''.
            if field:
                query = query.sort(**{field: direction})

    # Handle other parameters (values are kept as the strings from the URL)
    params = {
        key: query_params[key][0]
        for key in ('page', 'per-page', 'sample', 'seed')
        if key in query_params
    }
    return query, params
def invert_abstract(inv_index):
    """Reconstruct abstract from inverted index.

    Args:
        inv_index: Mapping of word -> list of positions at which the word
            occurs, or a missing value (``None``, NaN, ...).

    Returns:
        str: The words joined by single spaces in position order. A single
        space ``' '`` is returned when ``inv_index`` is not a mapping, and
        an empty string when the mapping is empty.
    """
    from collections.abc import Mapping

    # Missing abstracts can show up as None or as a float NaN (pandas uses
    # NaN for absent cells); neither has .items(), so treat every
    # non-mapping value as "no abstract".
    if not isinstance(inv_index, Mapping):
        return ' '

    positioned = [
        (position, word)
        for word, positions in inv_index.items()
        for position in positions
    ]
    # Sort on the position only so ties keep their original order.
    positioned.sort(key=lambda pair: pair[0])
    return " ".join(word for _, word in positioned)
def get_pub(x):
    """Extract publication name from record.

    Args:
        x: Object expected to expose ``x['source']['display_name']``.

    Returns:
        The source display name, or a single space ``' '`` when the lookup
        fails or the name is one of the placeholder values
        ``'parsed_publication'`` / ``'Deleted Journal'``.
    """
    try:
        source = x['source']['display_name']
    except (LookupError, TypeError):
        # Missing key, or x / x['source'] is None, NaN or another
        # non-subscriptable value. Only lookup failures are swallowed so
        # that KeyboardInterrupt and friends still propagate.
        return ' '
    if source in ('parsed_publication', 'Deleted Journal'):
        return ' '
    return source
def get_field(x):
    """Extract academic field from record.

    Args:
        x: Object expected to expose
            ``x['primary_topic']['subfield']['display_name']``.

    Returns:
        The subfield display name, or ``np.nan`` when the lookup fails or
        the stored name is ``None``.
    """
    try:
        field = x['primary_topic']['subfield']['display_name']
    except (LookupError, TypeError):
        # Missing key, or one of the nesting levels is None / not
        # subscriptable. Only lookup failures are swallowed so that
        # KeyboardInterrupt and friends still propagate.
        return np.nan
    return np.nan if field is None else field
def process_records_to_df(records):
    """
    Convert OpenAlex records to a pandas DataFrame with processed fields.

    Adds two derived columns: ``abstract`` (built with ``invert_abstract``
    from ``abstract_inverted_index``) and ``parsed_publication`` (built
    with ``get_pub`` from ``primary_location``). Missing values in
    ``abstract``, ``parsed_publication`` and ``title`` are replaced by a
    single space.

    Args:
        records (list): List of OpenAlex record dictionaries

    Returns:
        pandas.DataFrame: Processed DataFrame with abstracts, publications,
        and titles
    """
    records_df = pd.DataFrame(records)

    # An empty record list (or records that all lack one of these keys)
    # yields a frame without the column; create it empty so the steps
    # below do not fail with a KeyError.
    for column in ('abstract_inverted_index', 'primary_location', 'title'):
        if column not in records_df.columns:
            records_df[column] = None

    records_df['abstract'] = [
        invert_abstract(index) for index in records_df['abstract_inverted_index']
    ]
    records_df['parsed_publication'] = [
        get_pub(location) for location in records_df['primary_location']
    ]

    for column in ('parsed_publication', 'abstract', 'title'):
        records_df[column] = records_df[column].fillna(' ')
    return records_df
def openalex_url_to_filename(url):
    """
    Convert an OpenAlex URL to a filename-safe string with timestamp.

    Each ``key:value`` fragment of the ``filter`` parameter becomes
    ``key_value`` (``default.search`` becomes ``search_value``) and each
    ``sort`` entry becomes ``sort_field_direction``. The pieces are joined
    with ``__`` and followed by a ``YYYYMMDD_HHMMSS`` timestamp of the
    current local time. ``openalex_query`` is used when the URL has neither
    filters nor sort entries.

    Sort entries are read the same way as in ``field:direction`` /
    ``-field`` / ``field`` OpenAlex URLs, so ``cited_by_count:desc`` yields
    ``sort_cited_by_count_desc`` rather than a name containing a colon.

    Args:
        url (str): The OpenAlex search URL

    Returns:
        str: A filename-safe string with timestamp (without extension),
        at most 251 characters long so a short extension still fits in a
        255-character filename.
    """
    from datetime import datetime
    import re

    query_params = parse_qs(urlparse(url).query)

    # Create parts of the filename
    parts = []

    # Handle filters
    if 'filter' in query_params:
        for fragment in query_params['filter'][0].split(','):
            if ':' not in fragment:
                continue
            key, value = fragment.split(':', 1)
            # Replace dots with underscores in the key
            key = key.replace('.', '_')
            # Drop everything except word characters, whitespace and '-',
            # then collapse whitespace runs into single underscores.
            clean_value = re.sub(r'[^\w\s-]', '', value)
            clean_value = '_'.join(clean_value.split())
            if key == 'default_search':
                parts.append(f"search_{clean_value}")
            else:
                parts.append(f"{key}_{clean_value}")

    # Handle sort parameters
    if 'sort' in query_params:
        for entry in query_params['sort'][0].split(','):
            if ':' in entry:
                # field:direction format
                field, direction = entry.split(':', 1)
            elif entry.startswith('-'):
                # -field format
                field, direction = entry[1:], 'desc'
            else:
                # bare field format
                field, direction = entry, 'asc'
            # Keep only filename-safe characters in both pieces.
            field = re.sub(r'[^\w-]', '', field.replace('.', '_'))
            direction = re.sub(r'[^\w-]', '', direction) or 'asc'
            if field:  # skip empty entries such as a trailing comma
                parts.append(f"sort_{field}_{direction}")

    # Add timestamp
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    suffix = f"__{timestamp}"

    # Combine all parts
    stem = '__'.join(parts) if parts else 'openalex_query'

    # Keep the result within 251 characters (255 is a common filesystem
    # limit; 4 are left for a potential extension). Only the descriptive
    # stem is shortened so the timestamp always survives.
    max_length = 251
    stem = stem[:max_length - len(suffix)]
    return f"{stem}{suffix}"