Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug of Gerapy Auto Extractor about similarity2 #24

Open
wf4867612 opened this issue Aug 25, 2022 · 2 comments
Open

Bug of Gerapy Auto Extractor about similarity2 #24

wf4867612 opened this issue Aug 25, 2022 · 2 comments
Assignees
Labels
bug Something isn't working

Comments

@wf4867612
Copy link

def similarity2(s1, s2):
"""
get similarity of two strings
:param s1:
:param s2:
:return:
"""
if not s1 or not s2:
return 0
s1_set = set(list(s1))
s2_set = set(list(s2))
intersection = s1_set.intersection(s2_set)
union = s1_set.intersection(s2_set)
return len(intersection) / len(union)

union = s1_set.intersection(s2_set)    # 这个应该是并集才对吧,源码里边应该是取错了
@wf4867612 wf4867612 added the bug Something isn't working label Aug 25, 2022
@cbc123
Copy link

cbc123 commented Nov 13, 2022

我也发现这个问题了

@cbc123
Copy link

cbc123 commented Nov 14, 2022

from gerapy_auto_extractor.extractors.base import BaseExtractor
from lxml.html import HtmlElement, fromstring
from gerapy_auto_extractor.patterns.title import METAS
from gerapy_auto_extractor.utils.lcs import lcs_of_2
from gerapy_auto_extractor.utils.similarity import similarity2

class TitleExtractor(BaseExtractor):
"""
Title Extractor which extract title of page
"""

def extract_by_meta(self, element: HtmlElement) -> str:
    """
    extract according to meta
    :param element:
    :return: str
    """
    for xpath in METAS:
        title = element.xpath(xpath)
        if title:
            return ''.join(title)

def extract_by_title(self, element: HtmlElement):
    """
    get title from <title> tag
    :param element:
    :return:
    """
    return ''.join(element.xpath('//title//text()')).strip()

def extract_by_hs(self, element: HtmlElement):
    """
    get title from all h1-h3 tag
    :param element:
    :return:
    """
    hs = element.xpath('//h1//text()|//h2//text()|//h3//text()')
    return hs or []

def extract_by_h(self, element: HtmlElement):
    """
    extract by h tag, priority h1, h2, h3
    :param elemeent:
    :return:
    """
    for xpath in ['//h1', '//h2', '//h3']:
        children = element.xpath(xpath)
        if not children:
            continue
        child = children[0]
        texts = child.xpath('./text()')
        if texts and len(texts):
            return texts[0].strip()

def process(self, element: HtmlElement):
    """
    extract title from element
    :param element:
    :return:
    """
    title_extracted_by_meta = self.extract_by_meta(element)
    title_extracted_by_h = self.extract_by_h(element)
    title_extracted_by_hs = self.extract_by_hs(element)
    title_extracted_by_title = self.extract_by_title(element)

    # split logic to add more
    if title_extracted_by_meta:
        return title_extracted_by_meta

    title_extracted_by_hs = sorted(title_extracted_by_hs, key=lambda x: len(x), reverse=True)  # 最长字符的标签放前面
    max_dict = {}
    for index, h in enumerate(title_extracted_by_hs):
        jd = similarity2(h, title_extracted_by_title)
        if jd > 0:
            max_dict[index] = int(jd)
    key = 10
    for k, v in max_dict.items():
        if v == max(max_dict.values()):
            key = k
    print(title_extracted_by_hs[key])

    if title_extracted_by_hs:
        return title_extracted_by_hs[key]

    return title_extracted_by_title

title_extractor = TitleExtractor()

def extract_title(html):
"""
extract title from html
:param html:
:return:
"""
result = title_extractor.extract(html)
return result

#把title 替换成我这个就解决了

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

3 participants