Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 46 additions & 30 deletions src/osw/wiki_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def __init__(self, **data):

def prefix_search(
site: mwclient.client.Site, text: Union[str, SearchParam]
) -> List[str]:
) -> Union[List[str], List[dict]]:
"""Standard query. Equivalent to the following mediawiki API call
api.php?action=query&list=prefixsearch&pssearch=Star Wars.

Expand All @@ -165,15 +165,17 @@ def prefix_search(

Returns
-------
page_list :
List of page titles
result:
With ``return_json=False`` (default): a flat list of page titles. With
``return_json=True``: a list of raw MediaWiki ``prefixsearch`` API response
dicts, one per query (always a list, even for a single query).
"""
if not isinstance(text, SearchParam):
query = SearchParam(query=text)
else:
query = text

def prefix_search_(single_text):
def prefix_search_(single_text) -> Union[List[str], dict]:
page_list = list()
result = site.api(
"query",
Expand All @@ -182,15 +184,16 @@ def prefix_search_(single_text):
pslimit=query.limit,
format="json",
)
if len(result["query"]["prefixsearch"]) == 0:
if query.debug and len(result["query"]["prefixsearch"]) == 0:
print("No results")
if query.return_json:
return result

for page in result["query"]["prefixsearch"]:
title = page["title"]
if query.debug:
print("No results")
else:
for page in result["query"]["prefixsearch"]:
title = page["title"]
if query.debug:
print(title)
page_list.append(title)
print(title)
page_list.append(title)
return page_list

if query.parallel:
Expand All @@ -200,11 +203,17 @@ def prefix_search_(single_text):
else:
query_results = [prefix_search_(single_text=sq) for sq in query.query]

if query.return_json:
# Each entry of query_results is the raw API response dict for one query.
# Do not flatten dicts; always return the list of responses (one per query),
# even when only a single query was passed.
return query_results

return [item for sublist in query_results for item in sublist]
# todo: @Simon: a list of lists of strings (sublist for each query in query list)
# or a list of strings (results of all queries combined)?
# The last option would not change the behavior of the function, but would
# return page_list # original return
# return page_list # original return


def semantic_search(
Expand All @@ -221,8 +230,10 @@ def semantic_search(

Returns
-------
page_list:
List of page titles
result:
With ``return_json=False`` (default): a flat list of page-title fulltext
strings. With ``return_json=True``: a list of raw SMW ``ask`` result dicts,
one per query (always a list, even for a single query).
"""
if not isinstance(query, SearchParam):
query = SearchParam(query=query)
Expand All @@ -231,27 +242,26 @@ def semantic_search_(single_query):
page_list = list()
single_query += f"|limit={query.limit}"
result = site.api("ask", query=single_query, format="json")
if len(result["query"]["results"]) == 0:
if query.debug:
if query.debug:
if len(result["query"]["results"]) == 0:
print("Query '{}' returned no results".format(single_query))
else:
if query.debug:
else:
print(
"Query '{}' returned {} results".format(
single_query, len(result["query"]["results"])
)
)
if query.return_json:
return result

for page in result["query"]["results"].values():
title = page["fulltext"]
exists = page["exists"]
if "#" not in title and query.debug:
print(title)
# original position of "page_list.append(title)" line
if exists == "1":
page_list.append(title)
if query.return_json:
return result

for page in result["query"]["results"].values():
title = page["fulltext"]
exists = page["exists"]
if "#" not in title and query.debug:
print(title)
# original position of "page_list.append(title)" line
if exists == "1":
page_list.append(title)
return page_list

if query.parallel:
Expand All @@ -261,6 +271,12 @@ def semantic_search_(single_query):
else:
query_results = [semantic_search_(single_query=sq) for sq in query.query]

if query.return_json:
# Each entry of query_results is the raw SMW result dict for one query.
# Do not flatten dicts; always return the list of result dicts (one per
# query), even when only a single query was passed.
return query_results

return [item for sublist in query_results for item in sublist]
# todo: @Simon: a list of lists of strings (sublist for each query in query list)
# or a list of strings (results of all queries combined)?
Expand Down
122 changes: 122 additions & 0 deletions tests/test_wiki_tools.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from unittest.mock import MagicMock

import osw.wiki_tools as wt


Expand All @@ -6,3 +8,123 @@ def test_create_flat_content_structure_from_wikitext():
expected = [{"Template": {"param": ["value"]}}]
result = wt.create_flat_content_structure_from_wikitext(wikitext)
assert result == expected


def _ask_result(*titles):
    """Build a minimal SMW ``ask`` API result dict for the given page titles.

    Parameters
    ----------
    titles:
        Page titles to include. Each title becomes one key of
        ``result["query"]["results"]``.

    Returns
    -------
    result:
        A dict of the form ``{"query": {"results": {title: entry, ...}}}``.
        Every entry is marked as existing (``"exists": "1"``) and carries a
        single ``SomeProperty`` printout derived from the title. Calling the
        helper without titles yields an empty ``results`` mapping.
    """
    return {
        "query": {
            "results": {
                title: {
                    "fulltext": title,
                    "fullurl": f"https://example.org/wiki/{title}",
                    "namespace": 0,
                    "exists": "1",
                    "displaytitle": "",
                    "printouts": {"SomeProperty": [f"{title}-value"]},
                }
                for title in titles
            }
        }
    }


def test_semantic_search_return_json_single_query_returns_list_with_dict():
    """A single query with ``return_json=True`` yields a one-element list.

    The mocked ``site.api`` returns one raw ``ask`` result; the search must
    hand it back unmodified, wrapped in a list.
    """
    result = _ask_result("Item:OSW1")
    site = MagicMock()
    site.api.return_value = result

    out = wt.semantic_search(
        site, wt.SearchParam(query="[[HasType::Category:Item]]", return_json=True)
    )

    # A single query still returns a list (of one raw result dict), not a bare dict
    # and not a flattened list of keys
    assert isinstance(out, list)
    assert out == [result]
    # printouts must be preserved intact
    assert out[0]["query"]["results"]["Item:OSW1"]["printouts"] == {
        "SomeProperty": ["Item:OSW1-value"]
    }


def test_semantic_search_return_json_list_returns_list_of_dicts():
    """Several queries with ``return_json=True`` yield one raw dict per query.

    ``site.api`` is mocked with ``side_effect`` so that consecutive calls
    return ``result_a`` and then ``result_b``.
    """
    result_a = _ask_result("Item:OSW1")
    result_b = _ask_result("Item:OSW2")
    site = MagicMock()
    site.api.side_effect = [result_a, result_b]

    out = wt.semantic_search(
        site,
        wt.SearchParam(
            query=["[[HasType::Category:Item]]", "[[HasType::Category:Person]]"],
            return_json=True,
        ),
    )

    # A list of queries must return a list of result dicts, one per query
    assert isinstance(out, list)
    assert out == [result_a, result_b]


def test_semantic_search_returns_flat_list_of_titles():
    """Without ``return_json`` the search returns a flat list of page titles."""
    result = _ask_result("Item:OSW1", "Item:OSW2")
    site = MagicMock()
    site.api.return_value = result

    out = wt.semantic_search(site, "[[HasType::Category:Item]]")

    # return_json=False (the default) still yields a flat list of page titles
    assert out == ["Item:OSW1", "Item:OSW2"]


def _prefixsearch_result(*titles):
    """Build a minimal MediaWiki ``prefixsearch`` API result dict.

    Parameters
    ----------
    titles:
        Page titles to include, in order.

    Returns
    -------
    result:
        A dict with an empty ``"batchcomplete"`` marker and a
        ``result["query"]["prefixsearch"]`` list holding one entry per title.
        Entries use namespace ``0`` and page ids numbered from 1 in the order
        the titles were given.
    """
    return {
        "batchcomplete": "",
        "query": {
            "prefixsearch": [
                {"ns": 0, "title": title, "pageid": idx}
                for idx, title in enumerate(titles, start=1)
            ]
        },
    }


def test_prefix_search_return_json_single_query_returns_list_with_full_response():
    """A single query with ``return_json=True`` yields a one-element list.

    The mocked ``site.api`` returns one full ``prefixsearch`` response; the
    search must hand it back unmodified, wrapped in a list.
    """
    result = _prefixsearch_result("Star Wars", "Star Trek")
    site = MagicMock()
    site.api.return_value = result

    out = wt.prefix_search(site, wt.SearchParam(query="Star", return_json=True))

    # A single query returns a list of one full API response dict, not a flattened
    # list of individual prefixsearch entries
    assert isinstance(out, list)
    assert out == [result]
    assert out[0]["query"]["prefixsearch"][0]["title"] == "Star Wars"


def test_prefix_search_return_json_list_returns_list_of_full_responses():
    """Several queries with ``return_json=True`` yield one response per query.

    ``site.api`` is mocked with ``side_effect`` so that consecutive calls
    return ``result_a`` and then ``result_b``.
    """
    result_a = _prefixsearch_result("Star Wars")
    result_b = _prefixsearch_result("Voyager")
    site = MagicMock()
    site.api.side_effect = [result_a, result_b]

    out = wt.prefix_search(
        site, wt.SearchParam(query=["Star", "Voy"], return_json=True)
    )

    # One full API response dict per query, not merged/flattened across queries
    assert out == [result_a, result_b]


def test_prefix_search_returns_flat_list_of_titles():
    """Without ``return_json`` the search returns a flat list of page titles."""
    result = _prefixsearch_result("Star Wars", "Star Trek")
    site = MagicMock()
    site.api.return_value = result

    out = wt.prefix_search(site, "Star")

    # return_json=False (the default) still yields a flat list of page titles
    assert out == ["Star Wars", "Star Trek"]
Loading