import sys
import json
import pandas as pd


# Command-line contract: argv[1] is the path the result JSON is written to,
# argv[2] is the path of the input JSON that is parsed below.
data_file = sys.argv[2]
down_path = sys.argv[1]

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding.  The context manager releases the
# handle even when parsing fails; ``f`` stays bound afterwards, so a later
# ``f.close()`` is a harmless no-op.
with open(data_file, encoding="utf-8") as f:
    data = json.load(f)

# Pull the fields of the payload that the rest of the script works with.
# A missing key fails here, early, with a KeyError.
sarticle = data['sart_id']
sjnl = data['source_journal']
sart_title = data['source_articletitle']
sart_abstract = data['source_abstract']
jnls = data['all_journals']
# Shallow copy keyed like ``jnls``; its values are replaced further down by
# the per-journal overlap percentages.
jnl_comparision = jnls.copy()


# Compare the source journal with every other journal and store the overlap
# percentage for each one in jnl_comparision.
sjnl_cnt = len(sjnl)

for jnl in jnls:
    # Number of source-journal values that also occur in this journal's entry.
    dup_cnt = sum(1 for key in sjnl if sjnl[key] in jnls[jnl])

    # A plain integer 0 (not 0.0) is stored when nothing matched; this also
    # keeps the division from running when the source journal is empty.
    jnl_comparision[jnl] = round((dup_cnt / sjnl_cnt) * 100, 2) if dup_cnt else 0
        

# Shared overlap helper used by the duplicate checks.
def StrComparision(s_str, d_str):
    """Return the percentage of items of ``s_str`` that are contained in ``d_str``.

    Every item obtained by iterating ``s_str`` is tested with ``item in d_str``.
    For two strings this means a per-character test; for two lists it is a
    per-element test.  Repeated items in ``s_str`` are counted each time.

    Args:
        s_str: Sized iterable (string, list, ...) whose items are looked up.
        d_str: Container the items are looked up in.

    Returns:
        ``round(matches / len(s_str) * 100, 2)`` when at least one item
        matched, otherwise the integer ``0``.  ``0`` is also returned when the
        inputs cannot be compared at all (e.g. ``None`` or a float NaN), which
        surfaces as a ``TypeError`` from ``len``, iteration or ``in``.
    """
    try:
        total = len(s_str)
        matched = sum(1 for item in s_str if item in d_str)
    except TypeError:
        # Only the "not a sized iterable / not a container" case is treated as
        # "no overlap".  Anything else (including KeyboardInterrupt and
        # SystemExit) is allowed to propagate instead of being swallowed.
        return 0

    if not matched:
        # Also covers an empty ``s_str``, so the division below is safe.
        return 0
    return round((matched / total) * 100, 2)
        
        
# Load the rows stored under 'data_frame' into a DataFrame.
df = pd.DataFrame(data['data_frame'])

# Each jnl_id is stringified before it is used as a key into the journal
# mappings; an id that is missing from a mapping raises KeyError.
jnl_ids = df['jnl_id']
df['jnl_journal_name'] = jnl_ids.apply(lambda jnl_id: jnls[str(jnl_id)])
df['journal_name_percentage'] = jnl_ids.apply(
    lambda jnl_id: jnl_comparision[str(jnl_id)]
)

# Duplicate check on the article title.
def DupCheck_ATitle(x):
    """Return the overlap percentage between the source article title and ``x``.

    Args:
        x: Title of the article being compared against the source article.

    Returns:
        The value of ``StrComparision(source_title, x)``, or ``0`` when the
        source title cannot be read from the loaded payload.
    """
    try:
        source_title = data['source_articletitle']
    except (KeyError, TypeError):
        # Best effort: a payload without a usable title scores as "no overlap".
        # Only lookup failures are handled here, so interrupts and unrelated
        # bugs are no longer silently turned into a score of 0.
        return 0
    return StrComparision(source_title, x)


# Score every row's title against the source article title.
df['title_percentage'] = [DupCheck_ATitle(title) for title in df['art_title']]



# Duplicate check on the article abstract.
def DupCheck_abstract(x):
    """Return the overlap percentage between the source abstract and ``x``.

    Args:
        x: Abstract of the article being compared against the source article.

    Returns:
        The value of ``StrComparision(source_abstract, x)``, or ``0`` when the
        source abstract cannot be read from the loaded payload.
    """
    try:
        source_abstract = data['source_abstract']
    except (KeyError, TypeError):
        # Best effort: a payload without a usable abstract scores as "no
        # overlap".  Only lookup failures are handled here, so interrupts and
        # unrelated bugs are no longer silently turned into a score of 0.
        return 0
    return StrComparision(source_abstract, x)


# Score every row's abstract against the source article abstract.
df['abstract_percentage'] = [
    DupCheck_abstract(abstract) for abstract in df['art_abstract']
]



# Duplicate check on the article authors.
def DupCheck_authors(x):
    """Return the percentage of source authors that also appear in ``x``.

    Both author lists are comma-separated strings.  Each name is stripped of
    surrounding whitespace before comparison, so ``"A, B"`` and ``"B, A"`` are
    recognised as the same set of authors (without stripping, ``" B"`` and
    ``"B"`` would be treated as different names).  Matching is otherwise exact
    and case-sensitive.

    Args:
        x: Comma-separated author string of the article being compared.  It is
            passed through ``str()`` first, so a missing value such as ``None``
            is compared as the literal text ``"None"``.

    Returns:
        The value of ``StrComparision(source_authors, candidate_authors)``, or
        ``0`` when the source author string cannot be read from the loaded
        payload or is not a string.
    """
    try:
        source_authors = [name.strip() for name in data['sauthors'].split(',')]
    except (KeyError, TypeError, AttributeError):
        # Best effort: a payload without a usable author string scores as "no
        # overlap".  AttributeError covers a non-string value without .split().
        return 0

    candidate_authors = [name.strip() for name in str(x).split(',')]
    return StrComparision(source_authors, candidate_authors)
    
# Score every row's author list against the source article authors.
df['author_persentage'] = [
    DupCheck_authors(authors) for authors in df['art_authors']
]


# Average the four similarity percentages of every row, rounded to 2 decimals.
score_columns = zip(
    df['journal_name_percentage'],
    df['title_percentage'],
    df['abstract_percentage'],
    df['author_persentage'],
)
df['averge_per'] = [
    round((jnl_pct + title_pct + abstract_pct + author_pct) / 4, 2)
    for jnl_pct, title_pct, abstract_pct, author_pct in score_columns
]

# Order rows from the highest to the lowest average and renumber the index.
df = df.sort_values(by='averge_per', ascending=False, ignore_index=True)

# Write the ranked rows as a JSON array of records to the output path.
df.to_json(down_path, orient='records')

# Release the input file handle held in ``f``.
f.close()

# Print '0' on stdout once the result file has been written.
print('0')
