Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
edugain
eduGAIN Connectivity Check
Commits
11d4e71c
Commit
11d4e71c
authored
Jul 23, 2020
by
Marco Malavolti
Browse files
Added IdP exclusion with robots.txt
parent
226eb373
Changes
3
Hide whitespace changes
Inline
Side-by-side
README.md
View file @
11d4e71c
...
...
@@ -52,7 +52,12 @@ There are some situations where the check cannot work reliably. In those cases i
# Disable Checks
In cases where an IdP cannot be reliably checked, it is necessary to create, also empty,
`eccs-disabled.txt`
file on IdP's web root.
In cases where an IdP cannot be reliably checked, it is necessary to create or enrich the
`robots.txt`
file on the IdP's web root with:
```bash
User-agent: ECCS
Disallow: /
```
# On-line interface
...
...
eccs2.py
View file @
11d4e71c
...
...
@@ -6,7 +6,7 @@ import json
import
re
import
requests
from
eccs2properties
import
DAY
,
ECCS2HTMLDIR
,
ECCS2OUTPUTDIR
,
ECCS2RESULTSLOG
,
FEDS_BLACKLIST
,
IDPS_BLACKLIST
,
ECCS2SPS
,
ECCS2SELENIUMDEBUG
from
eccs2properties
import
DAY
,
ECCS2HTMLDIR
,
ECCS2OUTPUTDIR
,
ECCS2RESULTSLOG
,
FEDS_BLACKLIST
,
IDPS_BLACKLIST
,
ECCS2SPS
,
ECCS2SELENIUMDEBUG
,
ROBOTS_USER_AGENT
from
pathlib
import
Path
from
selenium.common.exceptions
import
TimeoutException
from
urllib3.util
import
parse_url
...
...
@@ -26,6 +26,15 @@ def getIDPfqdn(entityIDidp):
else
:
return
entityIDidp
.
split
(
":"
)[
-
1
]
def checkRobots(url_robots_txt, timeout=30):
    """Tell whether the robots.txt at *url_robots_txt* excludes the ECCS check.

    The file is downloaded and searched for a ``User-agent: ECCS`` line
    that is followed, after a single whitespace character, by
    ``Disallow: /``.

    Args:
        url_robots_txt: full URL of the robots.txt file to download.
        timeout: seconds to wait for the server; without it a silent
            server would block the check forever.

    Returns:
        True if the ECCS check MUST NOT be run, False otherwise.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            times out.
    """
    robots_txt = requests.get(url_robots_txt, timeout=timeout)
    # Raw string: "\s" and "\/" are regex escapes, not Python string escapes.
    p = re.compile(r'^User-agent:\sECCS\sDisallow:\s\/\s*$', re.MULTILINE)
    return p.search(robots_txt.text) is not None
# The function checks that the IdP recognized the SP by presenting its Login page.
# If the IdP Login page contains "username" and "password" fields, then the test is passed.
...
...
@@ -51,36 +60,44 @@ def checkIdP(sp,idp,test):
fqdn_sp
=
parse_url
(
sp
)[
2
]
wayfless_url
=
sp
+
idp
[
'entityID'
]
exclude_idp
=
""
robots
=
""
try
:
headers
=
{
'User-Agent'
:
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
exclude_idp
=
requests
.
get
(
"https://%s/eccs-disabled.txt"
%
fqdn_idp
,
headers
=
headers
,
verify
=
False
,
timeout
=
30
)
if
(
exclude_idp
==
""
):
exclude_idp
=
requests
.
get
(
"http://%s/eccs-disabled.txt"
%
fqdn_idp
,
headers
=
headers
,
verify
=
False
,
timeout
=
30
)
headers
=
{
'User-Agent'
:
'%s'
%
ROBOTS_USER_AGENT
}
except
requests
.
exceptions
.
ConnectionError
as
e
:
print
(
"!!! ECCS-DISABLED REQUESTS CONNECTION ERROR EXCEPTION !!!"
)
#print (e.__str__())
exclude_idp
=
""
robots
=
requests
.
get
(
"https://%s/robots.txt"
%
fqdn_idp
,
headers
=
headers
,
verify
=
True
,
timeout
=
30
)
except
requests
.
exceptions
.
Timeout
as
e
:
print
(
"!!! ECCS-DISABLED REQUESTS TIMEOUT EXCEPTION !!!"
)
#print (e.__str__())
exclude_idp
=
""
if
(
robots
==
""
):
robots
=
requests
.
get
(
"http://%s/robots.txt"
%
fqdn_idp
,
headers
=
headers
,
verify
=
True
,
timeout
=
30
)
if
(
exclude_idp
)
:
except
(
requests
.
exceptions
.
ConnectionError
,
requests
.
exceptions
.
Timeout
,
requests
.
exceptions
.
SSLError
)
as
e
:
check_time
=
datetime
.
datetime
.
utcnow
().
strftime
(
'%Y-%m-%dT%H:%M:%S'
)
+
'Z'
if
(
test
is
not
True
):
with
open
(
"%s/%s/%s---%s.html"
%
(
ECCS2HTMLDIR
,
DAY
,
fqdn_idp
,
fqdn_sp
),
"w"
)
as
html
:
html
.
write
(
"IdP excluded from check b
y eccs-disabled.txt"
)
html
.
write
(
"IdP excluded from check b
ecause the download of 'robots.txt' failed: %s"
%
e
.
__str__
()
)
else
:
print
(
"IdP excluded from check b
y eccs-disabled.txt"
)
print
(
"IdP excluded from check b
ecause the download of 'robots.txt' failed: %s"
%
e
.
__str__
()
)
return
(
idp
[
'entityID'
],
wayfless_url
,
check_time
,
"NULL"
,
"DISABLED"
)
if
(
robots
):
check_time
=
datetime
.
datetime
.
utcnow
().
strftime
(
'%Y-%m-%dT%H:%M:%S'
)
+
'Z'
p
=
re
.
compile
(
'^User-agent:\sECCS\sDisallow:\s\/\s*$'
,
re
.
MULTILINE
)
m
=
p
.
search
(
robots
.
text
)
if
(
m
):
if
(
test
is
not
True
):
with
open
(
"%s/%s/%s---%s.html"
%
(
ECCS2HTMLDIR
,
DAY
,
fqdn_idp
,
fqdn_sp
),
"w"
)
as
html
:
html
.
write
(
"IdP excluded from check by robots.txt"
)
else
:
print
(
"IdP excluded from check by robots.txt"
)
return
(
idp
[
'entityID'
],
wayfless_url
,
check_time
,
"NULL"
,
"DISABLED"
)
if
(
idp
[
'registrationAuthority'
]
in
federation_blacklist
):
check_time
=
datetime
.
datetime
.
utcnow
().
strftime
(
'%Y-%m-%dT%H:%M:%S'
)
+
'Z'
...
...
eccs2properties.py
View file @
11d4e71c
...
...
@@ -39,6 +39,9 @@ ECCS2NUMPROCESSES = 25
# The 2 SPs that will be used to test each IdP
ECCS2SPS
=
[
"https://sp24-test.garr.it/Shibboleth.sso/Login?entityID="
,
"https://attribute-viewer.aai.switch.ch/Shibboleth.sso/Login?entityID="
]
# ROBOTS.TXT
ROBOTS_USER_AGENT
=
"ECCS/2.0 (+https://dev-mm.aai-test.garr.it/eccs2)"
# Registration Authority of Federations to exclude from the check
FEDS_BLACKLIST
=
[
'http://www.surfconext.nl/'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment