H committed on
Commit
c61ccc8
·
1 Parent(s): e094004

Add component google scholar (#1790)

Browse files

### What problem does this PR solve?

#1739

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

graph/component/__init__.py CHANGED
@@ -16,6 +16,7 @@ from .pubmed import PubMed, PubMedParam
16
  from .arxiv import ArXiv, ArXivParam
17
  from .google import Google, GoogleParam
18
  from .bing import Bing, BingParam
 
19
 
20
 
21
  def component_class(class_name):
 
16
  from .arxiv import ArXiv, ArXivParam
17
  from .google import Google, GoogleParam
18
  from .bing import Bing, BingParam
19
+ from .googlescholar import GoogleScholar, GoogleScholarParam
20
 
21
 
22
  def component_class(class_name):
graph/component/googlescholar.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+ from abc import ABC
17
+ import pandas as pd
18
+ from graph.settings import DEBUG
19
+ from graph.component.base import ComponentBase, ComponentParamBase
20
+ from scholarly import scholarly
21
+
22
+
23
class GoogleScholarParam(ComponentParamBase):
    """
    Parameter container for the GoogleScholar component.

    Holds the result limit (``top_n``), the ordering (``sort_by``), the
    optional publication-year bounds (``year_low`` / ``year_high``) and the
    ``patents`` flag that the component forwards to the Scholar search.
    """

    def __init__(self):
        super().__init__()
        # Upper bound on how many publications a single run collects.
        self.top_n = 6
        # Result ordering; check() accepts only 'date' or 'relevance'.
        self.sort_by = 'relevance'
        # Optional publication-year bounds; None leaves that side unset.
        # NOTE(review): check() does not validate these two values.
        self.year_low = None
        self.year_high = None
        # Whether patents are included in the search results.
        self.patents = True

    def check(self):
        """Validate ``top_n``, ``sort_by`` and ``patents`` via the base-class checkers."""
        allowed_sort_orders = ['date', 'relevance']
        self.check_positive_integer(self.top_n, "Top N")
        self.check_valid_value(self.sort_by, "GoogleScholar Sort_by", allowed_sort_orders)
        self.check_boolean(self.patents, "Whether or not to include patents, defaults to True")
40
+
41
+
42
class GoogleScholar(ComponentBase, ABC):
    """
    Component that runs the upstream input as a Google Scholar query.

    The query is built by joining the upstream "content" values; up to
    ``top_n`` publications are fetched through ``scholarly`` and returned as a
    DataFrame with a single "content" column (one formatted record per row).
    """
    component_name = "GoogleScholar"

    @staticmethod
    def _format_publication(pub):
        """Render one publication record as the text block stored under "content".

        Missing fields fall back to empty values so that a single incomplete
        record does not abort the whole run.
        # assumes pub is a dict with a nested 'bib' dict and a top-level
        # 'pub_url', as the original indexing implies - TODO confirm
        """
        bib = pub.get('bib') or {}
        authors = bib.get('author') or []
        if isinstance(authors, str):
            # A plain string would otherwise be joined character by character.
            authors = [authors]
        return ('Title: ' + (bib.get('title') or '')
                + '\n_Url: <a href="' + (pub.get('pub_url') or '') + '"></a> '
                + "\n author: " + ",".join(authors)
                + '\n Abstract: ' + (bib.get('abstract') or 'no abstract'))

    def _run(self, history, **kwargs):
        """Search Google Scholar for the upstream content.

        Args:
            history: Unused by this component.
            **kwargs: Unused by this component.

        Returns:
            ``GoogleScholar.be_output("")`` when there is no query text or no
            publication could be collected; otherwise a DataFrame with one
            "content" row per publication.
        """
        ans = self.get_input()
        ans = " - ".join(ans["content"]) if "content" in ans else ""
        if not ans:
            return GoogleScholar.be_output("")

        scholar_client = scholarly.search_pubs(ans, patents=self._param.patents, year_low=self._param.year_low,
                                               year_high=self._param.year_high, sort_by=self._param.sort_by)
        scholar_res = []
        # One try around the whole loop: any failure stops collecting but keeps
        # the records gathered so far.
        try:
            for _ in range(self._param.top_n):
                pub = next(scholar_client)
                scholar_res.append({"content": self._format_publication(pub)})
        except StopIteration:
            # Fewer than top_n results are available; this is not an error.
            pass
        except Exception as e:
            # The original `except StopIteration or Exception` only ever caught
            # StopIteration; catch other failures explicitly, as it intended.
            print("**ERROR** " + str(e))

        if not scholar_res:
            return GoogleScholar.be_output("")

        df = pd.DataFrame(scholar_res)
        if DEBUG:
            print(df, ":::::::::::::::::::::::::::::::::")
        return df
requirements.txt CHANGED
@@ -12,6 +12,7 @@ datrie==0.8.2
12
  demjson3==3.0.6
13
  discord.py==2.3.2
14
  duckduckgo_search==6.1.9
 
15
  elastic_transport==8.12.0
16
  elasticsearch==8.12.1
17
  elasticsearch_dsl==8.12.0
@@ -31,7 +32,9 @@ httpx==0.27.0
31
  huggingface_hub==0.20.3
32
  infinity_emb==0.0.51
33
  itsdangerous==2.1.2
 
34
  Markdown==3.6
 
35
  minio==7.2.4
36
  mistralai==0.4.2
37
  nltk==3.8.1
@@ -51,6 +54,7 @@ pipreqs==0.5.0
51
  protobuf==5.27.2
52
  pyclipper==1.3.0.post5
53
  pycryptodomex==3.20.0
 
54
  PyPDF2==3.0.1
55
  pytest==8.2.2
56
  python-dotenv==1.0.1
@@ -61,6 +65,7 @@ redis==5.0.3
61
  Requests==2.32.2
62
  roman_numbers==1.0.2
63
  ruamel.base==1.0.0
 
64
  scikit_learn==1.5.0
65
  selenium==4.22.0
66
  setuptools==70.0.0
@@ -80,5 +85,3 @@ word2number==1.1
80
  xgboost==2.1.0
81
  xpinyin==0.7.6
82
  zhipuai==2.0.1
83
- pypdf==4.3.0
84
- jina==3.27.2
 
12
  demjson3==3.0.6
13
  discord.py==2.3.2
14
  duckduckgo_search==6.1.9
15
+ editdistance==0.8.1
16
  elastic_transport==8.12.0
17
  elasticsearch==8.12.1
18
  elasticsearch_dsl==8.12.0
 
32
  huggingface_hub==0.20.3
33
  infinity_emb==0.0.51
34
  itsdangerous==2.1.2
35
+ jina==3.27.2
36
  Markdown==3.6
37
+ markdown_to_json==2.1.1
38
  minio==7.2.4
39
  mistralai==0.4.2
40
  nltk==3.8.1
 
54
  protobuf==5.27.2
55
  pyclipper==1.3.0.post5
56
  pycryptodomex==3.20.0
57
+ pypdf==4.3.0
58
  PyPDF2==3.0.1
59
  pytest==8.2.2
60
  python-dotenv==1.0.1
 
65
  Requests==2.32.2
66
  roman_numbers==1.0.2
67
  ruamel.base==1.0.0
68
+ scholarly==1.7.11
69
  scikit_learn==1.5.0
70
  selenium==4.22.0
71
  setuptools==70.0.0
 
85
  xgboost==2.1.0
86
  xpinyin==0.7.6
87
  zhipuai==2.0.1
 
 
requirements_arm.txt CHANGED
@@ -155,4 +155,7 @@ Bio==1.7.1
155
  arxiv==2.1.3
156
  pypdf==4.3.0
157
  google_search_results==2.4.2
158
- jina==3.27.2
 
 
 
 
155
  arxiv==2.1.3
156
  pypdf==4.3.0
157
  google_search_results==2.4.2
158
+ jina==3.27.2
159
+ editdistance==0.8.1
160
+ markdown_to_json==2.1.1
161
+ scholarly==1.7.11
requirements_dev.txt CHANGED
@@ -140,4 +140,7 @@ Bio==1.7.1
140
  arxiv==2.1.3
141
  pypdf==4.3.0
142
  google_search_results==2.4.2
143
- jina==3.27.2
 
 
 
 
140
  arxiv==2.1.3
141
  pypdf==4.3.0
142
  google_search_results==2.4.2
143
+ jina==3.27.2
144
+ editdistance==0.8.1
145
+ markdown_to_json==2.1.1
146
+ scholarly==1.7.11