diff --git a/src/osw/wiki_tools.py b/src/osw/wiki_tools.py index 5766b81..db20fea 100644 --- a/src/osw/wiki_tools.py +++ b/src/osw/wiki_tools.py @@ -150,7 +150,7 @@ def __init__(self, **data): def prefix_search( site: mwclient.client.Site, text: Union[str, SearchParam] -) -> List[str]: +) -> Union[List[str], List[dict]]: """Standard query. Equivalent to the following mediawiki API call api.php?action=query&list=prefixsearch&pssearch=Star Wars. @@ -165,15 +165,17 @@ def prefix_search( Returns ------- - page_list : - List of page titles + result: + With ``return_json=False`` (default): a flat list of page titles. With + ``return_json=True``: a list of raw MediaWiki ``prefixsearch`` API response + dicts, one per query (always a list, even for a single query). """ if not isinstance(text, SearchParam): query = SearchParam(query=text) else: query = text - def prefix_search_(single_text): + def prefix_search_(single_text) -> Union[List[str], dict]: page_list = list() result = site.api( "query", @@ -182,15 +184,16 @@ def prefix_search_(single_text): pslimit=query.limit, format="json", ) - if len(result["query"]["prefixsearch"]) == 0: + if query.debug and len(result["query"]["prefixsearch"]) == 0: + print("No results") + if query.return_json: + return result + + for page in result["query"]["prefixsearch"]: + title = page["title"] if query.debug: - print("No results") - else: - for page in result["query"]["prefixsearch"]: - title = page["title"] - if query.debug: - print(title) - page_list.append(title) + print(title) + page_list.append(title) return page_list if query.parallel: @@ -200,11 +203,17 @@ def prefix_search_(single_text): else: query_results = [prefix_search_(single_text=sq) for sq in query.query] + if query.return_json: + # Each entry of query_results is the raw API response dict for one query. + # Do not flatten dicts; always return the list of responses (one per query), + # even when only a single query was passed. + return query_results + return [item for sublist in query_results for item in sublist] # todo: @Simon: a list of lists of strings (sublist for each query in query list) # or a list of strings (results of all queries combined)? # The last option would not change the behavior of the function, but would - # return page_list # original return + # return page_list # original return def semantic_search( @@ -221,8 +230,10 @@ def semantic_search( Returns ------- - page_list: - List of page titles + result: + With ``return_json=False`` (default): a flat list of page-title fulltext + strings. With ``return_json=True``: a list of raw SMW ``ask`` result dicts, + one per query (always a list, even for a single query). """ if not isinstance(query, SearchParam): query = SearchParam(query=query) @@ -231,27 +242,26 @@ def semantic_search_(single_query): page_list = list() single_query += f"|limit={query.limit}" result = site.api("ask", query=single_query, format="json") - if len(result["query"]["results"]) == 0: - if query.debug: + if query.debug: + if len(result["query"]["results"]) == 0: print("Query '{}' returned no results".format(single_query)) - else: - if query.debug: + else: print( "Query '{}' returned {} results".format( single_query, len(result["query"]["results"]) ) ) - if query.return_json: - return result - - for page in result["query"]["results"].values(): - title = page["fulltext"] - exists = page["exists"] - if "#" not in title and query.debug: - print(title) - # original position of "page_list.append(title)" line - if exists == "1": - page_list.append(title) + if query.return_json: + return result + + for page in result["query"]["results"].values(): + title = page["fulltext"] + exists = page["exists"] + if "#" not in title and query.debug: + print(title) + # original position of "page_list.append(title)" line + if exists == "1": + page_list.append(title) return page_list if query.parallel: @@ -261,6 +271,12 @@ def semantic_search_(single_query): else: query_results = [semantic_search_(single_query=sq) for sq in query.query] + if query.return_json: + # Each entry of query_results is the raw SMW result dict for one query. + # Do not flatten dicts; always return the list of result dicts (one per + # query), even when only a single query was passed. + return query_results + return [item for sublist in query_results for item in sublist] # todo: @Simon: a list of lists of strings (sublist for each query in query list) # or a list of strings (results of all queries combined)? diff --git a/tests/test_wiki_tools.py b/tests/test_wiki_tools.py index 292b688..ef63834 100644 --- a/tests/test_wiki_tools.py +++ b/tests/test_wiki_tools.py @@ -1,3 +1,5 @@ +from unittest.mock import MagicMock + import osw.wiki_tools as wt @@ -6,3 +8,123 @@ def test_create_flat_content_structure_from_wikitext(): expected = [{"Template": {"param": ["value"]}}] result = wt.create_flat_content_structure_from_wikitext(wikitext) assert result == expected + + +def _ask_result(*titles): + """Build a minimal SMW ``ask`` API result dict for the given page titles.""" + return { + "query": { + "results": { + title: { + "fulltext": title, + "fullurl": f"https://example.org/wiki/{title}", + "namespace": 0, + "exists": "1", + "displaytitle": "", + "printouts": {"SomeProperty": [f"{title}-value"]}, + } + for title in titles + } + } + } + + +def test_semantic_search_return_json_single_query_returns_list_with_dict(): + result = _ask_result("Item:OSW1") + site = MagicMock() + site.api.return_value = result + + out = wt.semantic_search( + site, wt.SearchParam(query="[[HasType::Category:Item]]", return_json=True) + ) + + # A single query still returns a list (of one raw result dict), not a bare dict + # and not a flattened list of keys + assert isinstance(out, list) + assert out == [result] + # printouts must be preserved intact + assert out[0]["query"]["results"]["Item:OSW1"]["printouts"] == { + "SomeProperty": ["Item:OSW1-value"] + } + + +def test_semantic_search_return_json_list_returns_list_of_dicts(): + result_a = _ask_result("Item:OSW1") + result_b = _ask_result("Item:OSW2") + site = MagicMock() + site.api.side_effect = [result_a, result_b] + + out = wt.semantic_search( + site, + wt.SearchParam( + query=["[[HasType::Category:Item]]", "[[HasType::Category:Person]]"], + return_json=True, + ), + ) + + # A list of queries must return a list of result dicts, one per query + assert isinstance(out, list) + assert out == [result_a, result_b] + + +def test_semantic_search_returns_flat_list_of_titles(): + result = _ask_result("Item:OSW1", "Item:OSW2") + site = MagicMock() + site.api.return_value = result + + out = wt.semantic_search(site, "[[HasType::Category:Item]]") + + # return_json=False (the default) still yields a flat list of page titles + assert out == ["Item:OSW1", "Item:OSW2"] + + +def _prefixsearch_result(*titles): + """Build a minimal MediaWiki ``prefixsearch`` API result dict.""" + return { + "batchcomplete": "", + "query": { + "prefixsearch": [ + {"ns": 0, "title": title, "pageid": idx} + for idx, title in enumerate(titles, start=1) + ] + }, + } + + +def test_prefix_search_return_json_single_query_returns_list_with_full_response(): + result = _prefixsearch_result("Star Wars", "Star Trek") + site = MagicMock() + site.api.return_value = result + + out = wt.prefix_search(site, wt.SearchParam(query="Star", return_json=True)) + + # A single query returns a list of one full API response dict, not a flattened + # list of individual prefixsearch entries + assert isinstance(out, list) + assert out == [result] + assert out[0]["query"]["prefixsearch"][0]["title"] == "Star Wars" + + +def test_prefix_search_return_json_list_returns_list_of_full_responses(): + result_a = _prefixsearch_result("Star Wars") + result_b = _prefixsearch_result("Voyager") + site = MagicMock() + site.api.side_effect = [result_a, result_b] + + out = wt.prefix_search( + site, wt.SearchParam(query=["Star", "Voy"], return_json=True) + ) + + # One full API response dict per query, not merged/flattened across queries + assert out == [result_a, result_b] + + +def test_prefix_search_returns_flat_list_of_titles(): + result = _prefixsearch_result("Star Wars", "Star Trek") + site = MagicMock() + site.api.return_value = result + + out = wt.prefix_search(site, "Star") + + # return_json=False (the default) still yields a flat list of page titles + assert out == ["Star Wars", "Star Trek"]